From: <syn...@us...> - 2008-05-25 22:39:55
|
Revision: 2627 http://clucene.svn.sourceforge.net/clucene/?rev=2627&view=rev Author: synhershko Date: 2008-05-25 15:39:54 -0700 (Sun, 25 May 2008) Log Message: ----------- Some previously committed fixes were left out, since this branch was created from the downloads section instead of trunk. Modified Paths: -------------- branches/lucene2_3_2/src/CLucene/LuceneThreads.h branches/lucene2_3_2/src/CLucene/analysis/Analyzers.cpp branches/lucene2_3_2/src/CLucene/index/IndexWriter.cpp branches/lucene2_3_2/src/CLucene/index/Term.cpp branches/lucene2_3_2/src/CLucene/index/TermInfo.cpp branches/lucene2_3_2/src/CLucene/search/Hits.cpp branches/lucene2_3_2/src/CLucene/store/FSDirectory.cpp branches/lucene2_3_2/src/CLucene/util/dirent.cpp Modified: branches/lucene2_3_2/src/CLucene/LuceneThreads.h =================================================================== --- branches/lucene2_3_2/src/CLucene/LuceneThreads.h 2008-05-25 22:03:18 UTC (rev 2626) +++ branches/lucene2_3_2/src/CLucene/LuceneThreads.h 2008-05-25 22:39:54 UTC (rev 2627) @@ -12,7 +12,6 @@ #if defined(_CL_DISABLE_MULTITHREADING) #define SCOPED_LOCK_MUTEX(theMutex) - #define SCOPED_LOCK_MUTEX_EX(theMutex,mutexname,dummy) #define DEFINE_MUTEX(x) #define STATIC_DEFINE_MUTEX(x) #define _LUCENE_SLEEP(x) Modified: branches/lucene2_3_2/src/CLucene/analysis/Analyzers.cpp =================================================================== --- branches/lucene2_3_2/src/CLucene/analysis/Analyzers.cpp 2008-05-25 22:03:18 UTC (rev 2626) +++ branches/lucene2_3_2/src/CLucene/analysis/Analyzers.cpp 2008-05-25 22:39:54 UTC (rev 2627) @@ -1,388 +1,388 @@ -/*------------------------------------------------------------------------------ -* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team -* -* Distributable under the terms of either the Apache License (Version 2.0) or -* the GNU Lesser General Public License, as specified in the COPYING file. -------------------------------------------------------------------------------*/ -#include "CLucene/StdHeader.h" -#include "Analyzers.h" -#include "CLucene/util/StringBuffer.h" - -CL_NS_USE(util) -CL_NS_DEF(analysis) - -CharTokenizer::CharTokenizer(Reader* in) : - Tokenizer(in), - offset(0), - bufferIndex(0), - dataLen(0), - ioBuffer(NULL) -{ - buffer[0]=0; -} - -TCHAR CharTokenizer::normalize(const TCHAR c) const -{ - return c; -} -bool CharTokenizer::next(Token* token){ - int32_t length = 0; - int32_t start = offset; - while (true) { - TCHAR c; - offset++; - if (bufferIndex >= dataLen) { - dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE); - if (dataLen == -1) - dataLen = 0; - bufferIndex = 0; - } - if (dataLen <= 0 ) { - if (length > 0) - break; - else - return false; - }else - c = ioBuffer[bufferIndex++]; - if (isTokenChar(c)) { // if it's a token TCHAR - - if (length == 0) // start of token - start = offset-1; - - buffer[length++] = normalize(c); // buffer it, normalized - - if (length == LUCENE_MAX_WORD_LEN) // buffer overflow! - break; - - } else if (length > 0) // at non-Letter w/ chars - break; // return 'em - - } - buffer[length]=0; - token->set( buffer, start, start+length); - return true; -} - -bool LetterTokenizer::isTokenChar(const TCHAR c) const { - return _istalpha(c)!=0; -} - - -TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const { - return _totlower(chr); -} - -bool WhitespaceTokenizer::isTokenChar(const TCHAR c) const{ - return _istspace(c)==0; //(return true if NOT a space) -} - -TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { - return _CLNEW WhitespaceTokenizer(reader); -} - -TokenStream* SimpleAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { - return _CLNEW LowerCaseTokenizer(reader); -} - -bool LowerCaseFilter::next(Token* t){ - if (!input->next(t)) - return false; - stringCaseFold( t->_termText ); - return true; -} - -StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords): - TokenFilter(in, deleteTokenStream), - table(_CLNEW CLSetList<const TCHAR*>(false)) -{ - fillStopTable( table,stopWords ); -} - -void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable, - const TCHAR** stopWords) { - for (int32_t i = 0; stopWords[i]!=NULL; i++) - stopTable->insert(stopWords[i]); -} - -bool StopFilter::next(Token* token) { - // return the first non-stop word found - while (input->next(token)){ - if (table->find(token->_termText)==table->end()){ - return true; - } - } - - // reached EOS -- return nothing - return false; -} - -StopAnalyzer::StopAnalyzer():stopTable(false) -{ - StopFilter::fillStopTable(&stopTable,ENGLISH_STOP_WORDS); -} -StopAnalyzer::~StopAnalyzer() -{ -} -StopAnalyzer::StopAnalyzer( const TCHAR** stopWords) { - StopFilter::fillStopTable(&stopTable,stopWords); -} -TokenStream* StopAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { - return _CLNEW StopFilter(_CLNEW LowerCaseTokenizer(reader),true, &stopTable); -} - -const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[] = -{ - _T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"), - _T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"), - _T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"), - _T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"), - _T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL -}; - -PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer): - analyzerMap(true,true) -{ - this->defaultAnalyzer = defaultAnalyzer; -} -PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){ - analyzerMap.clear(); - _CLDELETE(defaultAnalyzer); -} - -void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) { - analyzerMap.put(STRDUP_TtoT(fieldName), analyzer); -} - -TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) { - Analyzer* analyzer = (fieldName==NULL?defaultAnalyzer:analyzerMap.get(fieldName)); - if (analyzer == NULL) { - analyzer = defaultAnalyzer; - } - - return analyzer->tokenStream(fieldName, reader); -} - - - -bool ISOLatin1AccentFilter::next(Token* token){ - if ( input->next(token) ){ - int32_t l = token->termTextLength(); - const TCHAR* chars = token->termText(); - bool doProcess = false; - for (int i = 0; i < l; ++i) { - #ifdef _UCS2 - if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) { - #else - if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) { - #endif - doProcess = true; - break; - } - - } - if ( !doProcess ) { - return true; - } - - StringBuffer output(l*2); - for (int32_t j = 0; j < l; j++) { - #ifdef _UCS2 - TCHAR c = chars[j]; - #else - unsigned char c = chars[j]; - #endif - switch (c) { - case 0xC0 : // \xC0 - case 0xC1 : // \xC1 - case 0xC2 : // \xC2 - case 0xC3 : // \xC3 - case 0xC4 : // \xC4 - case 0xC5 : // \xC5 - output.appendChar('A'); - break; - case 0xC6 : // \xC6 - output.append(_T("AE")); - break; - case 0xC7 : // \xC7 - output.appendChar('C'); - break; - case 0xC8 : // \xC8 - case 0xC9 : // \xC9 - case 0xCA : // \xCA - case 0xCB : // \xCB - output.appendChar('E'); - break; - case 0xCC : // \xCC - case 0xCD : // \xCD - case 0xCE : // \xCE - case 0xCF : // \xCF - output.appendChar('I'); - break; - case 0xD0 : // \xD0 - output.appendChar('D'); - break; - case 0xD1 : // \xD1 - output.appendChar('N'); - break; - case 0xD2 : // \xD2 - case 0xD3 : // \xD3 - case 0xD4 : // \xD4 - case 0xD5 : // \xD5 - case 0xD6 : // \xD6 - case 0xD8 : // \xD8 - output.appendChar('O'); - break; - case 0xDE : // \xDE - output.append(_T("TH")); - break; - case 0xD9 : // \xD9 - case 0xDA : // \xDA - case 0xDB : // \xDB - case 0xDC : // \xDC - output.appendChar('U'); - break; - case 0xDD : // \xDD - output.appendChar('Y'); - break; - case 0xE0 : // \xE0 - case 0xE1 : // \xE1 - case 0xE2 : // \xE2 - case 0xE3 : // \xE3 - case 0xE4 : // \xE4 - case 0xE5 : // \xE5 - output.appendChar('a'); - break; - case 0xE6 : // \xE6 - output.append(_T("ae")); - break; - case 0xE7 : // \xE7 - output.appendChar('c'); - break; - case 0xE8 : // \xE8 - case 0xE9 : // \xE9 - case 0xEA : // \xEA - case 0xEB : // \xEB - output.appendChar('e'); - break; - case 0xEC : // \xEC - case 0xED : // \xED - case 0xEE : // \xEE - case 0xEF : // \xEF - output.appendChar('i'); - break; - case 0xF0 : // \xF0 - output.appendChar('d'); - break; - case 0xF1 : // \xF1 - output.appendChar('n'); - break; - case 0xF2 : // \xF2 - case 0xF3 : // \xF3 - case 0xF4 : // \xF4 - case 0xF5 : // \xF5 - case 0xF6 : // \xF6 - case 0xF8 : // \xF8 - output.appendChar('o'); - break; - case 0xDF : // \xDF - output.append(_T("ss")); - break; - case 0xFE : // \xFE - output.append(_T("th")); - break; - case 0xF9 : // \xF9 - case 0xFA : // \xFA - case 0xFB : // \xFB - case 0xFC : // \xFC - output.appendChar('u'); - break; - case 0xFD : // \xFD - case 0xFF : // \xFF - output.appendChar('y'); - break; - - #ifdef _UCS2 - case 0x152 : // \x8C - output.append(_T("OE")); - break; - case 0x153 : // \x9C - output.append(_T("oe")); - break; - case 0x178 : // \x9F - output.appendChar('Y'); - break; - #endif - default : - output.appendChar(c); - break; - } - } - token->setText(output.getBuffer()); - return true; - } - return false; -} - - -TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){ - return _CLNEW KeywordTokenizer(reader); -} - -KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize): - Tokenizer(input) -{ - this->done = false; - if ( bufferSize < 0 ) - this->bufferSize = DEFAULT_BUFFER_SIZE; -} -KeywordTokenizer::~KeywordTokenizer(){ -} - -bool KeywordTokenizer::next(Token* token){ - if (!done) { - done = true; - int32_t rd; - const TCHAR* buffer=0; - while (true) { - rd = input->read(buffer, bufferSize); - if (rd == -1) - break; - token->growBuffer(token->_termTextLen +rd+1); - - int32_t cp = rd; - if ( token->_termTextLen + cp > token->bufferLength() ) - cp = token->bufferLength() - token->_termTextLen; - _tcsncpy(token->_termText+token->_termTextLen,buffer,cp); - token->_termTextLen+=rd; - } - token->_termText[token->_termTextLen]=0; - token->set(token->_termText,0,token->_termTextLen); - return true; - } - return false; -} - - -LengthFilter::LengthFilter(TokenStream* in, int _min, int _max): - TokenFilter(in) -{ - this->_min = _min; - this->_max = _max; -} - -bool LengthFilter::next(Token* token) -{ - // return the first non-stop word found - while ( input->next(token) ) - { - size_t len = token->termTextLength(); - if (len >= _min && len <= _max) - return true; - // note: else we ignore it but should we index each part of it? - } - // reached EOS -- return null - return false; -} - - -CL_NS_END +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Analyzers.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(analysis) + +CharTokenizer::CharTokenizer(Reader* in) : + Tokenizer(in), + offset(0), + bufferIndex(0), + dataLen(0), + ioBuffer(NULL) +{ + buffer[0]=0; +} + +TCHAR CharTokenizer::normalize(const TCHAR c) const +{ + return c; +} +bool CharTokenizer::next(Token* token){ + int32_t length = 0; + int32_t start = offset; + while (true) { + TCHAR c; + offset++; + if (bufferIndex >= dataLen) { + dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE); + if (dataLen == -1) + dataLen = 0; + bufferIndex = 0; + } + if (dataLen <= 0 ) { + if (length > 0) + break; + else + return false; + }else + c = ioBuffer[bufferIndex++]; + if (isTokenChar(c)) { // if it's a token TCHAR + + if (length == 0) // start of token + start = offset-1; + + buffer[length++] = normalize(c); // buffer it, normalized + + if (length == LUCENE_MAX_WORD_LEN) // buffer overflow! + break; + + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + + } + buffer[length]=0; + token->set( buffer, start, start+length); + return true; +} + +bool LetterTokenizer::isTokenChar(const TCHAR c) const { + return _istalpha(c)!=0; +} + + +TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const { + return _totlower(chr); +} + +bool WhitespaceTokenizer::isTokenChar(const TCHAR c) const{ + return _istspace(c)==0; //(return true if NOT a space) +} + +TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { + return _CLNEW WhitespaceTokenizer(reader); +} + +TokenStream* SimpleAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { + return _CLNEW LowerCaseTokenizer(reader); +} + +bool LowerCaseFilter::next(Token* t){ + if (!input->next(t)) + return false; + stringCaseFold( t->_termText ); + return true; +} + +StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords): + TokenFilter(in, deleteTokenStream), + table(_CLNEW CLSetList<const TCHAR*>(false)) +{ + fillStopTable( table,stopWords ); +} + +void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable, + const TCHAR** stopWords) { + for (int32_t i = 0; stopWords[i]!=NULL; i++) + stopTable->insert(stopWords[i]); +} + +bool StopFilter::next(Token* token) { + // return the first non-stop word found + while (input->next(token)){ + if (table->find(token->_termText)==table->end()){ + return true; + } + } + + // reached EOS -- return nothing + return false; +} + +StopAnalyzer::StopAnalyzer():stopTable(false) +{ + StopFilter::fillStopTable(&stopTable,ENGLISH_STOP_WORDS); +} +StopAnalyzer::~StopAnalyzer() +{ +} +StopAnalyzer::StopAnalyzer( const TCHAR** stopWords) { + StopFilter::fillStopTable(&stopTable,stopWords); +} +TokenStream* StopAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { + return _CLNEW StopFilter(_CLNEW LowerCaseTokenizer(reader),true, &stopTable); +} + +const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[] = +{ + _T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"), + _T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"), + _T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"), + _T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"), + _T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL +}; + +PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer): + analyzerMap(true,true) +{ + this->defaultAnalyzer = defaultAnalyzer; +} +PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){ + analyzerMap.clear(); + _CLDELETE(defaultAnalyzer); +} + +void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) { + analyzerMap.put(STRDUP_TtoT(fieldName), analyzer); +} + +TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) { + Analyzer* analyzer = (fieldName==NULL?defaultAnalyzer:analyzerMap.get(fieldName)); + if (analyzer == NULL) { + analyzer = defaultAnalyzer; + } + + return analyzer->tokenStream(fieldName, reader); +} + + + +bool ISOLatin1AccentFilter::next(Token* token){ + if ( input->next(token) ){ + int32_t l = token->termTextLength(); + const TCHAR* chars = token->termText(); + bool doProcess = false; + for (int32_t i = 0; i < l; ++i) { + #ifdef _UCS2 + if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) { + #else + if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) { + #endif + doProcess = true; + break; + } + + } + if ( !doProcess ) { + return true; + } + + StringBuffer output(l*2); + for (int32_t j = 0; j < l; j++) { + #ifdef _UCS2 + TCHAR c = chars[j]; + #else + unsigned char c = chars[j]; + #endif + switch (c) { + case 0xC0 : // \xC0 + case 0xC1 : // \xC1 + case 0xC2 : // \xC2 + case 0xC3 : // \xC3 + case 0xC4 : // \xC4 + case 0xC5 : // \xC5 + output.appendChar('A'); + break; + case 0xC6 : // \xC6 + output.append(_T("AE")); + break; + case 0xC7 : // \xC7 + output.appendChar('C'); + break; + case 0xC8 : // \xC8 + case 0xC9 : // \xC9 + case 0xCA : // \xCA + case 0xCB : // \xCB + output.appendChar('E'); + break; + case 0xCC : // \xCC + case 0xCD : // \xCD + case 0xCE : // \xCE + case 0xCF : // \xCF + output.appendChar('I'); + break; + case 0xD0 : // \xD0 + output.appendChar('D'); + break; + case 0xD1 : // \xD1 + output.appendChar('N'); + break; + case 0xD2 : // \xD2 + case 0xD3 : // \xD3 + case 0xD4 : // \xD4 + case 0xD5 : // \xD5 + case 0xD6 : // \xD6 + case 0xD8 : // \xD8 + output.appendChar('O'); + break; + case 0xDE : // \xDE + output.append(_T("TH")); + break; + case 0xD9 : // \xD9 + case 0xDA : // \xDA + case 0xDB : // \xDB + case 0xDC : // \xDC + output.appendChar('U'); + break; + case 0xDD : // \xDD + output.appendChar('Y'); + break; + case 0xE0 : // \xE0 + case 0xE1 : // \xE1 + case 0xE2 : // \xE2 + case 0xE3 : // \xE3 + case 0xE4 : // \xE4 + case 0xE5 : // \xE5 + output.appendChar('a'); + break; + case 0xE6 : // \xE6 + output.append(_T("ae")); + break; + case 0xE7 : // \xE7 + output.appendChar('c'); + break; + case 0xE8 : // \xE8 + case 0xE9 : // \xE9 + case 0xEA : // \xEA + case 0xEB : // \xEB + output.appendChar('e'); + break; + case 0xEC : // \xEC + case 0xED : // \xED + case 0xEE : // \xEE + case 0xEF : // \xEF + output.appendChar('i'); + break; + case 0xF0 : // \xF0 + output.appendChar('d'); + break; + case 0xF1 : // \xF1 + output.appendChar('n'); + break; + case 0xF2 : // \xF2 + case 0xF3 : // \xF3 + case 0xF4 : // \xF4 + case 0xF5 : // \xF5 + case 0xF6 : // \xF6 + case 0xF8 : // \xF8 + output.appendChar('o'); + break; + case 0xDF : // \xDF + output.append(_T("ss")); + break; + case 0xFE : // \xFE + output.append(_T("th")); + break; + case 0xF9 : // \xF9 + case 0xFA : // \xFA + case 0xFB : // \xFB + case 0xFC : // \xFC + output.appendChar('u'); + break; + case 0xFD : // \xFD + case 0xFF : // \xFF + output.appendChar('y'); + break; + + #ifdef _UCS2 + case 0x152 : // \x8C + output.append(_T("OE")); + break; + case 0x153 : // \x9C + output.append(_T("oe")); + break; + case 0x178 : // \x9F + output.appendChar('Y'); + break; + #endif + default : + output.appendChar(c); + break; + } + } + token->setText(output.getBuffer()); + return true; + } + return false; +} + + +TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){ + return _CLNEW KeywordTokenizer(reader); +} + +KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize): + Tokenizer(input) +{ + this->done = false; + if ( bufferSize < 0 ) + this->bufferSize = DEFAULT_BUFFER_SIZE; +} +KeywordTokenizer::~KeywordTokenizer(){ +} + +bool KeywordTokenizer::next(Token* token){ + if (!done) { + done = true; + int32_t rd; + const TCHAR* buffer=0; + while (true) { + rd = input->read(buffer, bufferSize); + if (rd == -1) + break; + token->growBuffer(token->_termTextLen +rd+1); + + int32_t cp = rd; + if ( token->_termTextLen + cp > token->bufferLength() ) + cp = token->bufferLength() - token->_termTextLen; + _tcsncpy(token->_termText+token->_termTextLen,buffer,cp); + token->_termTextLen+=rd; + } + token->_termText[token->_termTextLen]=0; + token->set(token->_termText,0,token->_termTextLen); + return true; + } + return false; +} + + +LengthFilter::LengthFilter(TokenStream* in, int _min, int _max): + TokenFilter(in) +{ + this->_min = _min; + this->_max = _max; +} + +bool LengthFilter::next(Token* token) +{ + // return the first non-stop word found + while ( input->next(token) ) + { + size_t len = token->termTextLength(); + if (len >= _min && len <= _max) + return true; + // note: else we ignore it but should we index each part of it? + } + // reached EOS -- return null + return false; +} + + +CL_NS_END Modified: branches/lucene2_3_2/src/CLucene/index/IndexWriter.cpp =================================================================== --- branches/lucene2_3_2/src/CLucene/index/IndexWriter.cpp 2008-05-25 22:03:18 UTC (rev 2626) +++ branches/lucene2_3_2/src/CLucene/index/IndexWriter.cpp 2008-05-25 22:39:54 UTC (rev 2627) @@ -1,686 +1,690 @@ -/*------------------------------------------------------------------------------ -* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team -* -* Distributable under the terms of either the Apache License (Version 2.0) or -* the GNU Lesser General Public License, as specified in the COPYING file. -------------------------------------------------------------------------------*/ -#include "CLucene/StdHeader.h" -#include "IndexWriter.h" - -#include "CLucene/document/Document.h" -#include "CLucene/store/Directory.h" -#include "CLucene/store/Lock.h" -#include "CLucene/util/VoidList.h" -#include "DocumentWriter.h" -#include "SegmentInfos.h" -#include "SegmentMerger.h" - -CL_NS_USE(store) -CL_NS_USE(util) -CL_NS_USE(document) -CL_NS_USE(analysis) -CL_NS_DEF(index) - - - const char* IndexWriter::WRITE_LOCK_NAME = "write.lock"; - const char* IndexWriter::COMMIT_LOCK_NAME = "commit.lock"; - - IndexWriter::IndexWriter(const char* path, Analyzer* a, const bool create, const bool _closeDir): - directory( FSDirectory::getDirectory(path, create) ), - analyzer(a), - segmentInfos (_CLNEW SegmentInfos), - closeDir(_closeDir){ - //Func - Constructor - // Constructs an IndexWriter for the index in path. - //Pre - path != NULL and contains a named directory path - // a holds a valid reference to an analyzer and analyzes the text to be indexed - // create indicates if the indexWriter must create a new index located at path or just open it - //Post - If create is true, then a new, empty index has been created in path, replacing the index - // already there, if any. The named directory path is owned by this Instance - - CND_PRECONDITION(path != NULL, "path is NULL"); - - //Continue initializing the instance by _IndexWriter - _IndexWriter ( create ); - } - - IndexWriter::IndexWriter(Directory* d, Analyzer* a, const bool create, const bool _closeDir): - directory(_CL_POINTER(d)), - analyzer(a), - segmentInfos (_CLNEW SegmentInfos), - closeDir(_closeDir) - { - //Func - Constructor - // Constructs an IndexWriter for the index in path. - //Pre - d contains a valid reference to a directory - // a holds a valid reference to an analyzer and analyzes the text to be indexed - // create indicates if the indexWriter must create a new index located at path or just open it - //Post - If create is true, then a new, empty index has been created in path, replacing the index - // already there, if any. The directory d is not owned by this Instance - - //Continue initializing the instance by _IndexWriter - _IndexWriter ( create ); - } - - void IndexWriter::_IndexWriter(const bool create){ - //Func - Initialises the instances - //Pre - create indicates if the indexWriter must create a new index located at path or just open it - //Post - - - similarity = CL_NS(search)::Similarity::getDefault(); - - useCompoundFile = true; - if ( directory->getDirectoryType() == RAMDirectory::DirectoryType() ) - useCompoundFile = false; - - //Create a ramDirectory - ramDirectory = _CLNEW TransactionalRAMDirectory; - - CND_CONDITION(ramDirectory != NULL,"ramDirectory is NULL"); - - //Initialize the writeLock to - writeLock = NULL; - - //initialise the settings... - maxFieldLength = DEFAULT_MAX_FIELD_LENGTH; - mergeFactor = DEFAULT_MERGE_FACTOR; - maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; - writeLockTimeout = WRITE_LOCK_TIMEOUT; - commitLockTimeout = COMMIT_LOCK_TIMEOUT; - minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS; - termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; - - //Create a new lock using the name "write.lock" - LuceneLock* newLock = directory->makeLock(IndexWriter::WRITE_LOCK_NAME); - - //Condition check to see if newLock has been allocated properly - CND_CONDITION(newLock != NULL, "No memory could be allocated for LuceneLock newLock"); - - //Try to obtain a write lock - if (!newLock->obtain(writeLockTimeout)){ - //Write lock could not be obtained so delete it - _CLDELETE(newLock); - //Reset the instance - _finalize(); - //throw an exception because no writelock could be created or obtained - _CLTHROWA(CL_ERR_IO, "Index locked for write or no write access." ); - } - - //The Write Lock has been obtained so save it for later use - this->writeLock = newLock; - - //Create a new lock using the name "commit.lock" - LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); - - //Condition check to see if lock has been allocated properly - CND_CONDITION(lock != NULL, "No memory could be allocated for LuceneLock lock"); - - LockWith2 with ( lock,commitLockTimeout,this, NULL, create ); - { - SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync - with.run(); - } - - //Release the commit lock - _CLDELETE(lock); - - isOpen = true; - } - - void IndexWriter::_finalize(){ - //Func - Releases all the resources of the instance - //Pre - true - //Post - All the releases have been released - - if(writeLock != NULL){ - //release write lock - writeLock->release(); - _CLDELETE( writeLock ); - } - - //Delete the ramDirectory - if ( ramDirectory != NULL ){ - ramDirectory->close(); - _CLDECDELETE(ramDirectory); - } - - //clear segmentInfos and delete it - _CLDELETE(segmentInfos); - - } - - IndexWriter::~IndexWriter() { - //Func - Destructor - //Pre - true - //Post - The instance has been destroyed - close(); - _finalize(); - } - - - void IndexWriter::close( ) { - //Func - Flushes all changes to an index, closes all associated files, and closes - // the directory that the index is stored in. - //Pre - closeDir indicates if the directory must be closed or not - //Post - All the changes have been flushed to disk and the write lock has been released - // The ramDirectory has also been closed. The directory has been closed - // if the reference count of the directory reaches zero - - SCOPED_LOCK_MUTEX(THIS_LOCK) - if ( isOpen ){ - //Flush the Ram Segments - flushRamSegments(); - //Close the ram directory - if ( ramDirectory != NULL ){ - ramDirectory->close(); - _CLDECDELETE(ramDirectory); - } - - //Check if this instance must close the directory - if ( closeDir ){ - directory->close(); - } - _CLDECDELETE(directory); - - // release write lock - if (writeLock != NULL){ - writeLock->release(); - _CLDELETE( writeLock ); - } - - isOpen = false; - } - } - - - int32_t IndexWriter::docCount(){ - //Func - Counts the number of documents in the index - //Pre - true - //Post - The number of documents have been returned - - SCOPED_LOCK_MUTEX(THIS_LOCK) - - //Initialize count - int32_t count = 0; - - //Iterate through all segmentInfos - for (int32_t i = 0; i < segmentInfos->size(); i++) { - //Get the i-th SegmentInfo - SegmentInfo* si = segmentInfos->info(i); - //Retrieve the number of documents of the segment and add it to count - count += si->docCount; - } - return count; - } - - void IndexWriter::addDocument(Document* doc, Analyzer* analyzer) { - //Func - Adds a document to the index - //Pre - doc contains a valid reference to a document - // ramDirectory != NULL - //Post - The document has been added to the index of this IndexWriter - CND_PRECONDITION(ramDirectory != NULL,"ramDirectory is NULL"); - - if ( analyzer == NULL ) - analyzer = this->analyzer; - - ramDirectory->transStart(); - try { - char* segmentName = newSegmentName(); - CND_CONDITION(segmentName != NULL, "segmentName is NULL"); - try { - //Create the DocumentWriter using a ramDirectory and analyzer - // supplied by the IndexWriter (this). - DocumentWriter* dw = _CLNEW DocumentWriter( - ramDirectory, analyzer, this ); - CND_CONDITION(dw != NULL, "dw is NULL"); - try { - //Add the client-supplied document to the new segment. - dw->addDocument(segmentName, doc); - } _CLFINALLY( - _CLDELETE(dw); - ); - - //Create a new SegmentInfo instance about this new segment. - SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory); - CND_CONDITION(si != NULL, "Si is NULL"); - - { - SCOPED_LOCK_MUTEX(THIS_LOCK) - - //Add the info object for this particular segment to the list - // of all segmentInfos-> - segmentInfos->add(si); - - //Check to see if the segments must be merged - maybeMergeSegments(); - } - } _CLFINALLY( - _CLDELETE_CaARRAY(segmentName); - ); - - } catch (...) { - ramDirectory->transAbort(); - throw; - } - ramDirectory->transCommit(); - } - - - void IndexWriter::optimize() { - //Func - Optimizes the index for which this Instance is responsible - //Pre - true - //Post - - SCOPED_LOCK_MUTEX(THIS_LOCK) - //Flush the RamSegments to disk - flushRamSegments(); - while (segmentInfos->size() > 1 || - (segmentInfos->size() == 1 && - (SegmentReader::hasDeletions(segmentInfos->info(0)) || - segmentInfos->info(0)->getDir()!=directory || - (useCompoundFile && - (!SegmentReader::usesCompoundFile(segmentInfos->info(0)) || - SegmentReader::hasSeparateNorms(segmentInfos->info(0))))))) { - - int32_t minSegment = segmentInfos->size() - mergeFactor; - - mergeSegments(minSegment < 0 ? 0 : minSegment); - } - } - - - char* IndexWriter::newSegmentName() { - SCOPED_LOCK_MUTEX(THIS_LOCK) - - TCHAR buf[9]; - _i64tot(segmentInfos->counter++,buf,36); //36 is RADIX of 10 digits and 26 numbers - - int32_t rlen = _tcslen(buf) + 2; - char* ret = _CL_NEWARRAY(char,rlen); - strcpy(ret,"_"); - STRCPY_TtoA(ret+1,buf,rlen-1); //write at 2nd character, for a maximum of 9 characters - return ret; - } - - void IndexWriter::flushRamSegments() { - //Func - Merges all RAM-resident segments. - //Pre - ramDirectory != NULL - //Post - The RAM-resident segments have been merged to disk - - CND_PRECONDITION(ramDirectory != NULL, "ramDirectory is NULL"); - - int32_t minSegment = segmentInfos->size()-1; //don't make this unsigned... - CND_CONDITION(minSegment >= -1, "minSegment must be >= -1"); - - int32_t docCount = 0; - //Iterate through all the segements and check if the directory is a ramDirectory - while (minSegment >= 0 && - segmentInfos->info(minSegment)->getDir() == ramDirectory) { - docCount += segmentInfos->info(minSegment)->docCount; - minSegment--; - } - if (minSegment < 0 || // add one FS segment? - (docCount + segmentInfos->info(minSegment)->docCount) > mergeFactor || - !(segmentInfos->info(segmentInfos->size()-1)->getDir() == ramDirectory)) - minSegment++; - - CND_CONDITION(minSegment >= 0, "minSegment must be >= 0"); - if (minSegment >= segmentInfos->size()) - return; // none to merge - mergeSegments(minSegment); - } - - void IndexWriter::maybeMergeSegments() { - //Func - Incremental Segment Merger - //Pre - - //Post - - - int64_t targetMergeDocs = minMergeDocs; - - // find segments smaller than current target size - while (targetMergeDocs <= maxMergeDocs) { - int32_t minSegment = segmentInfos->size(); - int32_t mergeDocs = 0; - - while (--minSegment >= 0) { - SegmentInfo* si = segmentInfos->info(minSegment); - if (si->docCount >= targetMergeDocs) - break; - mergeDocs += si->docCount; - } - - if (mergeDocs >= targetMergeDocs){ - // found a merge to do - mergeSegments(minSegment+1); - }else - break; - - //increase target size - targetMergeDocs *= mergeFactor; - } - } - - void IndexWriter::mergeSegments(const uint32_t minSegment) { - mergeSegments(minSegment, segmentInfos->size()); - } - - void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end) { - CLVector<SegmentReader*> segmentsToDelete(false); - const char* mergedName = newSegmentName(); -#ifdef _CL_DEBUG_INFO - fprintf(_CL_DEBUG_INFO, "merging segments\n"); -#endif - SegmentMerger merger(this, mergedName); - for (size_t i = minSegment; i < end; i++) { - SegmentInfo* si = segmentInfos->info(i); -#ifdef _CL_DEBUG_INFO - fprintf(_CL_DEBUG_INFO, " %s (%d docs)\n",si->name,si->docCount); -#endif - SegmentReader* reader = _CLNEW SegmentReader(si); - merger.add(reader); - if ((reader->getDirectory() == this->directory) || // if we own the directory - (reader->getDirectory() == this->ramDirectory)){ - segmentsToDelete.push_back(reader); // queue segment for deletion - } - } - - int32_t mergedDocCount = merger.merge(); - -#ifdef _CL_DEBUG_INFO - fprintf(_CL_DEBUG_INFO,"\n into %s (%d docs)\n",mergedName, mergedDocCount); -#endif - - segmentInfos->clearto(minSegment);// remove old infos & add new - segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) ); - - // close readers before we attempt to delete now-obsolete segments - merger.closeReaders(); - - LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); - LockWith2 with ( lock, commitLockTimeout,this, &segmentsToDelete, true ); - - { - SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync - with.run(); - } - _CLDELETE( lock ); - - - - if (useCompoundFile) { - char cmpdTmpName[CL_MAX_PATH]; - strcpy(cmpdTmpName,mergedName); - strcat(cmpdTmpName,".tmp"); - - AStringArrayWithDeletor filesToDelete; - merger.createCompoundFile(cmpdTmpName, filesToDelete); - - LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); - LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete); - { - SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync - with.run(); - } - _CLDELETE(lock); - } - - _CLDELETE_CaARRAY( mergedName ); - } - - void IndexWriter::deleteSegments(CLVector<SegmentReader*>* segments) { - AStringArrayWithDeletor deletable; - - {//scope delete deleteArray object - AStringArrayWithDeletor deleteArray; - readDeleteableFiles(deleteArray); - deleteFiles(deleteArray, deletable); // try to delete deleteable - } - - AStringArrayWithDeletor files; - for (uint32_t i = 0; i < segments->size(); i++) { - SegmentReader* reader = (*segments)[i]; - files.clear(); - reader->files(files); - if (reader->getDirectory() == this->directory) - deleteFiles(files, deletable); // try to delete our files - else - deleteFiles(files, reader->getDirectory()); // delete, eg, RAM files - } - - writeDeleteableFiles(deletable); // note files we can't delete - } - - void IndexWriter::readDeleteableFiles(AStringArrayWithDeletor& result) { - if (!directory->fileExists("deletable")) - return; - - IndexInput* input = directory->openInput("deletable"); - try { - TCHAR tname[CL_MAX_PATH]; - for (int32_t i = input->readInt(); i > 0; i--){ // read file names - input->readString(tname,CL_MAX_PATH); - result.push_back(STRDUP_TtoA(tname)); - } - } _CLFINALLY( - input->close(); - _CLDELETE(input); - ); - } - - void IndexWriter::writeDeleteableFiles(AStringArrayWithDeletor& files) { - IndexOutput* output = directory->createOutput("deleteable.new"); - try { - output->writeInt(files.size()); - TCHAR tfile[CL_MAX_PATH]; //temporary space for tchar file name - for (uint32_t i = 0; i < files.size(); i++){ - STRCPY_AtoT(tfile,files[i],CL_MAX_PATH); - output->writeString( tfile, _tcslen(tfile) ); - } - } _CLFINALLY( - output->close(); - _CLDELETE(output); - ); - - directory->renameFile("deleteable.new", "deletable"); - } - - void IndexWriter::deleteFiles(AStringArrayWithDeletor& files){ - AStringArrayWithDeletor deletable; - AStringArrayWithDeletor currDeletable; - readDeleteableFiles(currDeletable); - deleteFiles(currDeletable, deletable); // try to delete deleteable - deleteFiles(files, deletable); // try to delete our files - writeDeleteableFiles(deletable); // note files we can't delete - } - - void IndexWriter::deleteFiles(AStringArrayWithDeletor& files, Directory* directory) { - AStringArrayWithDeletor::iterator itr = files.begin(); - while ( itr != files.end() ){ - directory->deleteFile( *itr, true ); - ++itr; - } - } - - void IndexWriter::deleteFiles(AStringArrayWithDeletor& files, AStringArrayWithDeletor& deletable) { - AStringArrayWithDeletor::iterator itr=files.begin(); - while ( itr != files.end() ){ - const char* file = *itr; - if ( getDirectory()->fileExists(file) ){ - if ( !getDirectory()->deleteFile(file, false) ){ - if (directory->fileExists(file)) { - #ifdef _CL_DEBUG_INFO - fprintf(_CL_DEBUG_INFO,"%s; Will re-try later.\n", err.what()); - #endif - deletable.push_back(STRDUP_AtoA(file)); // add to deletable - } - } - } - ++itr; - } - } - - - - void IndexWriter::addIndexes(Directory** dirs) { - //Func - Add several indexes located in different directories into the current - // one managed by this instance - //Pre - dirs != NULL and contains directories of several indexes - // dirsLength > 0 and contains the number of directories - //Post - The indexes located in the directories in dirs have been merged with - // the pre(current) index. The Resulting index has also been optimized - - SCOPED_LOCK_MUTEX(THIS_LOCK) - - CND_PRECONDITION(dirs != NULL, "dirs is NULL"); - - // start with zero or 1 seg so optimize the current - optimize(); - - int32_t start = segmentInfos->size(); - - //Iterate through the directories - int32_t i = 0; - while ( dirs[i] != NULL ) { - // DSR: Changed SegmentInfos constructor arg (see bug discussion below). - SegmentInfos sis(false); - sis.read( dirs[i]); - - for (int32_t j = 0; j < sis.size(); j++) { - segmentInfos->add(sis.info(j)); // add each info - } - i++; - } - - // merge newly added segments in log(n) passes - while (segmentInfos->size() > start+mergeFactor) { - for (int32_t base = start; base < segmentInfos->size(); base++) { - int32_t end = min(segmentInfos->size(), base+mergeFactor); - if (end-base > 1) - mergeSegments(base, end); - } - } - - optimize(); // cleanup - } - - - void IndexWriter::addIndexes(IndexReader** readers){ - SCOPED_LOCK_MUTEX(THIS_LOCK) - optimize(); // start with zero or 1 seg - - char* mergedName = newSegmentName(); - SegmentMerger merger(this, mergedName); - - CLVector<SegmentReader*> segmentsToDelete; - SegmentReader* sReader = NULL; - if (segmentInfos->size() == 1){ // add existing index, if any - sReader = _CLNEW SegmentReader(segmentInfos->info(0)); - merger.add(sReader); - segmentsToDelete.push_back(sReader); // queue segment for deletion - } - - int32_t readersLength = 0; - while ( readers[readersLength] != NULL ) - merger.add(readers[readersLength++]); - - int32_t docCount = merger.merge(); // merge 'em - - // pop old infos & add new - segmentInfos->clearto(0); - segmentInfos->add(_CLNEW SegmentInfo(mergedName, docCount, directory)); - - if ( sReader != NULL ){ - sReader->close(); - _CLDELETE(sReader); - } - - LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); - LockWith2 with ( lock,commitLockTimeout,this, &segmentsToDelete, true); - { - SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync - with.run(); - } - _CLDELETE(lock); - - if (useCompoundFile) { - char cmpdTmpName[CL_MAX_PATH]; - strcpy(cmpdTmpName,mergedName); - strcat(cmpdTmpName,".tmp"); - - AStringArrayWithDeletor filesToDelete; - merger.createCompoundFile(cmpdTmpName, filesToDelete); - - LuceneLock* cfslock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); - LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete); - { - SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync - with.run(); - } - _CLDELETE(cfslock); - } - _CLDELETE_CaARRAY(mergedName); - } - - - IndexWriter::LockWith2::LockWith2(CL_NS(store)::LuceneLock* lock, int64_t lockWaitTimeout, - IndexWriter* wr, - CL_NS(util)::CLVector<SegmentReader*>* std, - bool create): - CL_NS(store)::LuceneLockWith<void>(lock,lockWaitTimeout) - { - this->writer = wr; - this->segmentsToDelete = std; - this->create = create; - } - - void IndexWriter::LockWith2::doBody() { - //Func - Writes segmentInfos to or reads segmentInfos from disk - //Pre - writer != NULL - //Post - if create is true then segementInfos has been written to disk otherwise - // segmentInfos has been read from disk - - CND_PRECONDITION(writer != NULL, "writer is NULL"); - - if (create){ - writer->segmentInfos->write(writer->getDirectory()); - if ( segmentsToDelete != NULL ) - writer->deleteSegments(segmentsToDelete); // delete now-unused segments - }else - writer->segmentInfos->read(writer->getDirectory()); - } - - IndexWriter::LockWithCFS::LockWithCFS(CL_NS(store)::LuceneLock* lock, int64_t lockWaitTimeout, - CL_NS(store)::Directory* dir, - IndexWriter* wr, - const char* segName, - CL_NS(util)::AStringArrayWithDeletor* ftd): - CL_NS(store)::LuceneLockWith<void>(lock,lockWaitTimeout) - { - this->segName = segName; - this->directory = dir; - this->writer = wr; - this->filesToDelete = ftd; - } - void IndexWriter::LockWithCFS::doBody() { - //Func - Writes segmentInfos to or reads segmentInfos from disk - //Pre - writer != NULL - //Post - if create is true then segementInfos has been written to disk otherwise - // segmentInfos has been read from disk - - CND_PRECONDITION(directory != NULL, "directory is NULL"); - CND_PRECONDITION(segName != NULL, "mergedName is NULL"); - - char from[CL_MAX_PATH]; - char nu[CL_MAX_PATH]; - - strcpy(from,segName); - strcat(from,".tmp"); - strcpy(nu,segName); - strcat(nu,".cfs"); - - // make compound file visible for SegmentReaders - directory->renameFile(from, nu); - // delete now unused files of segment - writer->deleteFiles(*filesToDelete); - } - -CL_NS_END +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "IndexWriter.h" + +#include "CLucene/document/Document.h" +#include "CLucene/store/Directory.h" +#include "CLucene/store/Lock.h" +#include "CLucene/util/VoidList.h" +#include "DocumentWriter.h" +#include "SegmentInfos.h" +#include "SegmentMerger.h" + +CL_NS_USE(store) +CL_NS_USE(util) +CL_NS_USE(document) +CL_NS_USE(analysis) +CL_NS_DEF(index) + + + const char* IndexWriter::WRITE_LOCK_NAME = "write.lock"; + const char* IndexWriter::COMMIT_LOCK_NAME = "commit.lock"; + + IndexWriter::IndexWriter(const char* path, Analyzer* a, const bool create, const bool _closeDir): + directory( FSDirectory::getDirectory(path, create) ), + analyzer(a), + segmentInfos (_CLNEW SegmentInfos), + closeDir(_closeDir){ + //Func - Constructor + // Constructs an IndexWriter for the index in path. + //Pre - path != NULL and contains a named directory path + // a holds a valid reference to an analyzer and analyzes the text to be indexed + // create indicates if the indexWriter must create a new index located at path or just open it + //Post - If create is true, then a new, empty index has been created in path, replacing the index + // already there, if any. The named directory path is owned by this Instance + + CND_PRECONDITION(path != NULL, "path is NULL"); + + //Continue initializing the instance by _IndexWriter + _IndexWriter ( create ); + } + + IndexWriter::IndexWriter(Directory* d, Analyzer* a, const bool create, const bool _closeDir): + directory(_CL_POINTER(d)), + analyzer(a), + segmentInfos (_CLNEW SegmentInfos), + closeDir(_closeDir) + { + //Func - Constructor + // Constructs an IndexWriter for the index in path. + //Pre - d contains a valid reference to a directory + // a holds a valid reference to an analyzer and analyzes the text to be indexed + // create indicates if the indexWriter must create a new index located at path or just open it + //Post - If create is true, then a new, empty index has been created in path, replacing the index + // already there, if any. The directory d is not owned by this Instance + + //Continue initializing the instance by _IndexWriter + _IndexWriter ( create ); + } + + void IndexWriter::_IndexWriter(const bool create){ + //Func - Initialises the instances + //Pre - create indicates if the indexWriter must create a new index located at path or just open it + //Post - + + similarity = CL_NS(search)::Similarity::getDefault(); + + useCompoundFile = true; + if ( directory->getDirectoryType() == RAMDirectory::DirectoryType() ) + useCompoundFile = false; + + //Create a ramDirectory + ramDirectory = _CLNEW TransactionalRAMDirectory; + + CND_CONDITION(ramDirectory != NULL,"ramDirectory is NULL"); + + //Initialize the writeLock to + writeLock = NULL; + + //initialise the settings... + maxFieldLength = DEFAULT_MAX_FIELD_LENGTH; + mergeFactor = DEFAULT_MERGE_FACTOR; + maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; + writeLockTimeout = WRITE_LOCK_TIMEOUT; + commitLockTimeout = COMMIT_LOCK_TIMEOUT; + minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS; + termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; + + //Create a new lock using the name "write.lock" + LuceneLock* newLock = directory->makeLock(IndexWriter::WRITE_LOCK_NAME); + + //Condition check to see if newLock has been allocated properly + CND_CONDITION(newLock != NULL, "No memory could be allocated for LuceneLock newLock"); + + //Try to obtain a write lock + if (!newLock->obtain(writeLockTimeout)){ + //Write lock could not be obtained so delete it + _CLDELETE(newLock); + //Reset the instance + _finalize(); + //throw an exception because no writelock could be created or obtained + _CLTHROWA(CL_ERR_IO, "Index locked for write or no write access." ); + } + + //The Write Lock has been obtained so save it for later use + this->writeLock = newLock; + + //Create a new lock using the name "commit.lock" + LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); + + //Condition check to see if lock has been allocated properly + CND_CONDITION(lock != NULL, "No memory could be allocated for LuceneLock lock"); + + LockWith2 with ( lock,commitLockTimeout,this, NULL, create ); + { + SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync + with.run(); + } + + //Release the commit lock + _CLDELETE(lock); + + isOpen = true; + } + + void IndexWriter::_finalize(){ + //Func - Releases all the resources of the instance + //Pre - true + //Post - All the releases have been released + + if(writeLock != NULL){ + //release write lock + writeLock->release(); + _CLDELETE( writeLock ); + } + + //Delete the ramDirectory + if ( ramDirectory != NULL ){ + ramDirectory->close(); + _CLDECDELETE(ramDirectory); + } + + //clear segmentInfos and delete it + _CLDELETE(segmentInfos); + + } + + IndexWriter::~IndexWriter() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + close(); + _finalize(); + } + + + void IndexWriter::close( ) { + //Func - Flushes all changes to an index, closes all associated files, and closes + // the directory that the index is stored in. + //Pre - closeDir indicates if the directory must be closed or not + //Post - All the changes have been flushed to disk and the write lock has been released + // The ramDirectory has also been closed. The directory has been closed + // if the reference count of the directory reaches zero + + SCOPED_LOCK_MUTEX(THIS_LOCK) + if ( isOpen ){ + //Flush the Ram Segments + flushRamSegments(); + //Close the ram directory + if ( ramDirectory != NULL ){ + ramDirectory->close(); + _CLDECDELETE(ramDirectory); + } + + //Check if this instance must close the directory + if ( closeDir ){ + directory->close(); + } + _CLDECDELETE(directory); + + // release write lock + if (writeLock != NULL){ + writeLock->release(); + _CLDELETE( writeLock ); + } + + isOpen = false; + } + } + + + int32_t IndexWriter::docCount(){ + //Func - Counts the number of documents in the index + //Pre - true + //Post - The number of documents have been returned + + SCOPED_LOCK_MUTEX(THIS_LOCK) + + //Initialize count + int32_t count = 0; + + //Iterate through all segmentInfos + for (int32_t i = 0; i < segmentInfos->size(); i++) { + //Get the i-th SegmentInfo + SegmentInfo* si = segmentInfos->info(i); + //Retrieve the number of documents of the segment and add it to count + count += si->docCount; + } + return count; + } + + void IndexWriter::addDocument(Document* doc, Analyzer* analyzer) { + //Func - Adds a document to the index + //Pre - doc contains a valid reference to a document + // ramDirectory != NULL + //Post - The document has been added to the index of this IndexWriter + CND_PRECONDITION(ramDirectory != NULL,"ramDirectory is NULL"); + + if ( analyzer == NULL ) + analyzer = this->analyzer; + + ramDirectory->transStart(); + try { + char* segmentName = newSegmentName(); + CND_CONDITION(segmentName != NULL, "segmentName is NULL"); + try { + //Create the DocumentWriter using a ramDirectory and analyzer + // supplied by the IndexWriter (this). + DocumentWriter* dw = _CLNEW DocumentWriter( + ramDirectory, analyzer, this ); + CND_CONDITION(dw != NULL, "dw is NULL"); + try { + //Add the client-supplied document to the new segment. + dw->addDocument(segmentName, doc); + } _CLFINALLY( + _CLDELETE(dw); + ); + + //Create a new SegmentInfo instance about this new segment. + SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory); + CND_CONDITION(si != NULL, "Si is NULL"); + + { + SCOPED_LOCK_MUTEX(THIS_LOCK) + + //Add the info object for this particular segment to the list + // of all segmentInfos-> + segmentInfos->add(si); + + //Check to see if the segments must be merged + maybeMergeSegments(); + } + } _CLFINALLY( + _CLDELETE_CaARRAY(segmentName); + ); + + } catch (...) { + ramDirectory->transAbort(); + throw; + } + ramDirectory->transCommit(); + } + + + void IndexWriter::optimize() { + //Func - Optimizes the index for which this Instance is responsible + //Pre - true + //Post - + SCOPED_LOCK_MUTEX(THIS_LOCK) + //Flush the RamSegments to disk + flushRamSegments(); + while (segmentInfos->size() > 1 || + (segmentInfos->size() == 1 && + (SegmentReader::hasDeletions(segmentInfos->info(0)) || + segmentInfos->info(0)->getDir()!=directory || + (useCompoundFile && + (!SegmentReader::usesCompoundFile(segmentInfos->info(0)) || + SegmentReader::hasSeparateNorms(segmentInfos->info(0))))))) { + + int32_t minSegment = segmentInfos->size() - mergeFactor; + + mergeSegments(minSegment < 0 ? 0 : minSegment); + } + } + + + char* IndexWriter::newSegmentName() { + SCOPED_LOCK_MUTEX(THIS_LOCK) + + TCHAR buf[9]; + _i64tot(segmentInfos->counter++,buf,36); //36 is RADIX of 10 digits and 26 numbers + + int32_t rlen = _tcslen(buf) + 2; + char* ret = _CL_NEWARRAY(char,rlen); + strcpy(ret,"_"); + STRCPY_TtoA(ret+1,buf,rlen-1); //write at 2nd character, for a maximum of 9 characters + return ret; + } + + void IndexWriter::flushRamSegments() { + //Func - Merges all RAM-resident segments. + //Pre - ramDirectory != NULL + //Post - The RAM-resident segments have been merged to disk + + CND_PRECONDITION(ramDirectory != NULL, "ramDirectory is NULL"); + + int32_t minSegment = segmentInfos->size()-1; //don't make this unsigned... + CND_CONDITION(minSegment >= -1, "minSegment must be >= -1"); + + int32_t docCount = 0; + //Iterate through all the segements and check if the directory is a ramDirectory + while (minSegment >= 0 && + segmentInfos->info(minSegment)->getDir() == ramDirectory) { + docCount += segmentInfos->info(minSegment)->docCount; + minSegment--; + } + if (minSegment < 0 || // add one FS segment? + (docCount + segmentInfos->info(minSegment)->docCount) > mergeFactor || + !(segmentInfos->info(segmentInfos->size()-1)->getDir() == ramDirectory)) + minSegment++; + + CND_CONDITION(minSegment >= 0, "minSegment must be >= 0"); + if (minSegment >= segmentInfos->size()) + return; // none to merge + mergeSegments(minSegment); + } + + void IndexWriter::maybeMergeSegments() { + //Func - Incremental Segment Merger + //Pre - + //Post - + + int64_t targetMergeDocs = minMergeDocs; + + // find segments smaller than current target size + while (targetMergeDocs <= maxMergeDocs) { + int32_t minSegment = segmentInfos->size(); + int32_t mergeDocs = 0; + + while (--minSegment >= 0) { + SegmentInfo* si = segmentInfos->info(minSegment); + if (si->docCount >= targetMergeDocs) + break; + mergeDocs += si->docCount; + } + + if (mergeDocs >= targetMergeDocs){ + // found a merge to do + mergeSegments(minSegment+1); + }else + break; + + //increase target size + targetMergeDocs *= mergeFactor; + } + } + + void IndexWriter::mergeSegments(const uint32_t minSegment) { + mergeSegments(minSegment, segmentInfos->size()); + } + + void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end) { + CLVector<SegmentReader*> segmentsToDelete(false); + const char* mergedName = newSegmentName(); +#ifdef _CL_DEBUG_INFO + fprintf(_CL_DEBUG_INFO, "merging segments\n"); +#endif + SegmentMerger merger(this, mergedName); + for (size_t i = minSegment; i < end; i++) { + SegmentInfo* si = segmentInfos->info(i); +#ifdef _CL_DEBUG_INFO + fprintf(_CL_DEBUG_INFO, " %s (%d docs)\n",si->name,si->docCount); +#endif + SegmentReader* reader = _CLNEW SegmentReader(si); + merger.add(reader); + if ((reader->getDirectory() == this->directory) || // if we own the directory + (reader->getDirectory() == this->ramDirectory)){ + segmentsToDelete.push_back(reader); // queue segment for deletion + } + } + + int32_t mergedDocCount = merger.merge(); + +#ifdef _CL_DEBUG_INFO + fprintf(_CL_DEBUG_INFO,"\n into %s (%d docs)\n",mergedName, mergedDocCount); +#endif + + segmentInfos->clearto(minSegment);// remove old infos & add new + segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) ); + + // close readers before we attempt to delete now-obsolete segments + merger.closeReaders(); + + LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); + LockWith2 with ( lock, commitLockTimeout,this, &segmentsToDelete, true ); + + { + SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync + with.run(); + } + _CLDELETE( lock ); + + + + if (useCompoundFile) { + char cmpdTmpName[CL_MAX_PATH]; + strcpy(cmpdTmpName,mergedName); + strcat(cmpdTmpName,".tmp"); + + AStringArrayWithDeletor filesToDelete; + merger.createCompoundFile(cmpdTmpName, filesToDelete); + + LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME); + LockWithCFS with ( lock,commitLockTimeout,directory, this, mergedName, &filesToDelete); + { + SCOPED_LOCK_MUTEX(directory->THIS_LOCK) // in- & inter-process sync + with.run(); + } + _CLDELETE(lock); + } + + _CLDELETE_CaARRAY( mergedName ); + } + + void IndexWriter::deleteSegments(CLVector<SegmentReader*>* segments) { + AStringArrayWithDeletor deletable; + + {//scope delete deleteArray object + AStringArrayWithDeletor deleteArray; + readDeleteableFiles(deleteArray); + deleteFiles(deleteArray, deletable); // try to delete deleteable + } + + AStringArrayWithDeletor files; + for (uint32_t i = 0; i < segments->size(); i++) { + SegmentReader* reader = (*segments)[i]; + files.clear(); + reader->files(files); + if (reader->getDirectory() == this->directory) + deleteFiles(files, deletable); // try to delete our files + else + deleteFiles(files, reader->getDirectory()); // delete, eg, RAM files + } + + writeDeleteableFiles(deletable); // note files we can't delete + } + + void IndexWriter::readDeleteableFiles(AStringArrayWithDeletor& result) { + if (!directory->fileExists("deletable")) + return; + + IndexInput* input = directory->openInput("deletable"); + try { + TCHAR tname[CL_MAX_PATH]; + for (int32_t i = input->readInt(); i > 0; i--){ // read file names + input->readString(tname,CL_MAX_PATH); + result.push_back(STRDUP_TtoA(tname)); + } + } _CLFINALLY( + input->close(); + _CLDELETE(input); + ); + } + + void IndexWriter::writeDeleteableFiles(AStringArrayWithDeletor& files) { + IndexOutput* output = directory->createOutput("deleteable.new"); + try { + output->writeInt(files.size()); + TCHAR tfile[CL_MAX_PATH]; //temporary space for tchar file name + for (uint32_t i = 0; i < files.size(); i++){ + STRCPY_AtoT(tfile,files[i],CL_MAX_PATH); + output->writeString( tfile, _tcslen(tfile) ); + } + } _CLFINALLY( + output->close(); + _CLDELETE(output); + ); + + directory->renameFile("deleteable.new", "deletable"); + } + + void IndexWriter::deleteFiles(AStringArrayWithDeletor& files){ + AStringArrayWithDeletor deletable; + AStringArrayWithDeletor currDeletable; + readDeleteableFiles(currDeletable); + deleteFiles(currDeletable, deletable); // try to delete deleteable + deleteFiles(files, deletable); // try to delete our files + writeDeleteableFiles(deletable); // note files we can't delete + } + + void IndexWriter::deleteFiles(AStringArrayWithDeletor& files, Directory* directory) { + AStringArrayWithDeletor::iterator itr = files.begin(); + while ( itr != files.end() ){ + directory->deleteFile( *itr, true ); + ++itr; + } + } + + void IndexWriter::deleteFiles(AStringArrayWithDeletor& files, AStringArrayWithDeletor& deletable) { + AStringArrayWithDeletor::iterator itr=files.begin(); + while ( itr != files.end() ){ + const char* file = *itr; + if ( getDirectory()->fileExists(file) ){ + if ( !getDirectory()->deleteFile(file, false) ){ + if (directory->fileExists(file)) { + #ifdef _CL_DEBUG_INFO + fprintf(_CL_DEBUG_INFO,"%s; Will re-try later.\n", err.what()); + #endif + deletable.push_back(STRDUP_AtoA(file)); // add to deletable + } + } + } + ++itr; + } + } + + + + void IndexWriter::addIndexes(Directory** dirs) { + //Func - Add several index... [truncated message content] |