|
From: <syn...@us...> - 2009-05-02 21:26:27
|
Revision: 3004
http://clucene.svn.sourceforge.net/clucene/?rev=3004&view=rev
Author: synhershko
Date: 2009-05-02 21:26:19 +0000 (Sat, 02 May 2009)
Log Message:
-----------
Updated QueryParser to conform with JLucene 2.3.2:
* Old queryParser is still available under queryParser::legacy. It's no longer supported, and will probably be removed before the final release.
* Added NO_RESOLUTION to DateTools::Resolution to allow for NULL resolution in QP
* MultiFieldQueryParser is temporarily unavailable.
* Several v2.1+ Query classes were not ported yet, hence the QP still does not support them.
* FuzzyQuery::toString and BooleanQuery::toString were updated to conform with JL 2.3.2
* New QP might differ in syntax from the legacy one, as it completely conforms with JL 2.3.2. Examples for such differences are in the tests, where incompatible queries were commented out
* Breaking change: TokenStream::next(Token*) signature was changed - it now accepts Token*& and returns Token*. If NULL pointer is passed, a new Token object will be created. This also affects all derived classes (Filters, Tokenizers and Analyzers).
* Tests were updated to comply with the above change.
* DocumentWriter::invertDocument was also updated to comply with this change
* LUCENE_TOKEN_WORD_LENGTH macro is not supported in the current QP implementation for queryParser::Token.
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.cpp
branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.h
branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.cpp
branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.h
branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.cpp
branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.h
branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.h
branches/lucene2_3_2/src/core/CLucene/document/DateTools.h
branches/lucene2_3_2/src/core/CLucene/files_list.txt
branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/MultiFieldQueryParser.h
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParser.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParser.h
branches/lucene2_3_2/src/core/CLucene/search/BooleanQuery.cpp
branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
branches/lucene2_3_2/src/core/CLucene/search/PhraseQuery.cpp
branches/lucene2_3_2/src/core/CLucene/util/Equators.h
branches/lucene2_3_2/src/core/CLucene/util/VoidMap.h
branches/lucene2_3_2/src/core/CMakeLists.txt
branches/lucene2_3_2/src/test/CuTest.cpp
branches/lucene2_3_2/src/test/CuTest.h
branches/lucene2_3_2/src/test/analysis/TestAnalysis.cpp
branches/lucene2_3_2/src/test/analysis/TestAnalyzers.cpp
branches/lucene2_3_2/src/test/queryParser/TestMultiFieldQueryParser.cpp
branches/lucene2_3_2/src/test/queryParser/TestQueryParser.cpp
branches/lucene2_3_2/src/test/search/TestSearch.cpp
branches/lucene2_3_2/src/test/tests.cpp
Added Paths:
-----------
branches/lucene2_3_2/src/core/CLucene/queryParser/CharStream.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/FastCharStream.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParserConstants.h
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParserTokenManager.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParserTokenManager.h
branches/lucene2_3_2/src/core/CLucene/queryParser/Token.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/Token.h
branches/lucene2_3_2/src/core/CLucene/queryParser/_CharStream.h
branches/lucene2_3_2/src/core/CLucene/queryParser/_FastCharStream.h
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/Lexer.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/MultiFieldQueryParser.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/MultiFieldQueryParser.h
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/QueryParser.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/QueryParser.h
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/QueryParserBase.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/QueryToken.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/QueryToken.h
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/TokenList.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/_Lexer.h
branches/lucene2_3_2/src/core/CLucene/queryParser/legacy/_TokenList.h
Removed Paths:
-------------
branches/lucene2_3_2/src/core/CLucene/queryParser/Lexer.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParserBase.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryToken.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/QueryToken.h
branches/lucene2_3_2/src/core/CLucene/queryParser/TokenList.cpp
branches/lucene2_3_2/src/core/CLucene/queryParser/_Lexer.h
branches/lucene2_3_2/src/core/CLucene/queryParser/_TokenList.h
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -240,7 +240,6 @@
TokenStream::~TokenStream(){
}
-
TokenFilter::TokenFilter(TokenStream* in, bool deleteTS):
input(in),
deleteTokenStream(deleteTS)
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/AnalysisHeader.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -186,8 +186,29 @@
*/
class CLUCENE_EXPORT TokenStream:LUCENE_BASE {
public:
- /** Sets token to the next token in the stream, returns false at the EOS. */
- virtual bool next(Token* token) = 0;
+ /** Returns the next token in the stream, or null at EOS.
+ * When possible, the input Token should be used as the
+ * returned Token (this gives fastest tokenization
+ * performance), but this is not required and a new Token
+ * may be returned (pass NULL for this).
+ * Callers may re-use a single Token instance for successive
+ * calls to this method.
+ * <p>
+ * This implicitly defines a "contract" between
+ * consumers (callers of this method) and
+ * producers (implementations of this method
+ * that are the source for tokens):
+ * <ul>
+ * <li>A consumer must fully consume the previously
+ * returned Token before calling this method again.</li>
+ * <li>A producer must call {@link Token#clear()}
+ * before setting the fields in it & returning it</li>
+ * </ul>
+ * Note that a {@link TokenFilter} is considered a consumer.
+ * @param result a Token that may or may not be used to return
+ * @return next token in the stream or null if end-of-stream was hit
+ */
+ virtual Token* next(Token*& token) = 0;
/** This is for backwards compatibility only. You should pass the token you want to fill
* to next(), this will save a lot of object construction and destructions.
@@ -205,7 +226,7 @@
* of a TokenStream are intended to be consumed more than
* once, it is necessary to implement reset().
*/
- //virtual void reset(CL_NS(util)::Reader* _input=NULL) = 0;
+ //virtual void reset(CL_NS(util)::Reader* _input=NULL){}
virtual ~TokenStream();
};
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -28,7 +28,7 @@
{
return c;
}
-bool CharTokenizer::next(Token* token){
+Token* CharTokenizer::next(Token*& token){
int32_t length = 0;
int32_t start = offset;
while (true) {
@@ -36,15 +36,13 @@
offset++;
if (bufferIndex >= dataLen) {
dataLen = input->read(ioBuffer, 1, LUCENE_IO_BUFFER_SIZE );
- if (dataLen == -1)
- dataLen = 0;
bufferIndex = 0;
}
if (dataLen <= 0 ) {
if (length > 0)
break;
else
- return false;
+ return NULL;
}else
c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token TCHAR
@@ -62,8 +60,11 @@
}
buffer[length]=0;
- token->set( buffer, start, start+length);
- return true;
+ if (token != NULL)
+ token->set( buffer, start, start+length);
+ else
+ token = _CLNEW Token( buffer, start, start+length );
+ return token;
}
void CharTokenizer::reset(CL_NS(util)::Reader* input)
{
@@ -147,11 +148,11 @@
LowerCaseFilter::~LowerCaseFilter(){
}
-bool LowerCaseFilter::next(Token* t){
- if (!input->next(t))
- return false;
+Token* LowerCaseFilter::next(Token*& t){
+ if (input->next(t) == NULL)
+ return NULL;
stringCaseFold( t->_termText );
- return true;
+ return t;
}
bool StopFilter::ENABLE_POSITION_INCREMENTS_DEFAULT = false;
@@ -206,7 +207,7 @@
stopTable->insert( stopWords[i] );
}
-bool StopFilter::next(Token* token) {
+Token* StopFilter::next(Token*& token) {
// return the first non-stop word found
int32_t skippedPositions = 0;
while (input->next(token)){
@@ -215,13 +216,13 @@
if (enablePositionIncrements) {
token->setPositionIncrement(token->getPositionIncrement() + skippedPositions);
}
- return true;
+ return token;
}
skippedPositions += token->getPositionIncrement();
}
// reached EOS -- return nothing
- return false;
+ return NULL;
}
StopAnalyzer::StopAnalyzer(const char* stopwordsFile, const char* enc):
@@ -312,7 +313,7 @@
}
ISOLatin1AccentFilter::~ISOLatin1AccentFilter(){
}
-bool ISOLatin1AccentFilter::next(Token* token){
+Token* ISOLatin1AccentFilter::next(Token*& token){
if ( input->next(token) ){
int32_t l = token->termLength();
const TCHAR* chars = token->termBuffer();
@@ -329,7 +330,7 @@
}
if ( !doProcess ) {
- return true;
+ return token;
}
StringBuffer output(l*2);
@@ -466,9 +467,9 @@
}
}
token->setText(output.getBuffer());
- return true;
+ return token;
}
- return false;
+ return NULL;
}
@@ -498,8 +499,10 @@
KeywordTokenizer::~KeywordTokenizer(){
}
-bool KeywordTokenizer::next(Token* token){
+Token* KeywordTokenizer::next(Token*& token){
if (!done) {
+ if (token==NULL)
+ token = _CLNEW Token();
done = true;
int32_t rd;
const TCHAR* buffer=0;
@@ -517,9 +520,9 @@
}
token->_termText[token->_termTextLen]=0;
token->set(token->_termText,0,token->_termTextLen);
- return true;
+ return token;
}
- return false;
+ return NULL;
}
void KeywordTokenizer::reset(CL_NS(util)::Reader* input)
{
@@ -535,18 +538,18 @@
this->_max = _max;
}
-bool LengthFilter::next(Token* token)
+Token* LengthFilter::next(Token*& token)
{
// return the first non-stop word found
while ( input->next(token) )
{
size_t len = token->termLength();
if (len >= _min && len <= _max)
- return true;
+ return token;
// note: else we ignore it but should we index each part of it?
}
// reached EOS -- return null
- return false;
+ return NULL;
}
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/Analyzers.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -38,7 +38,7 @@
public:
CharTokenizer(CL_NS(util)::Reader* in);
- bool next(Token* token);
+ Token* next(Token*& token);
void reset(CL_NS(util)::Reader* input);
virtual ~CharTokenizer();
@@ -126,7 +126,7 @@
public:
LowerCaseFilter(TokenStream* in, bool deleteTokenStream);
virtual ~LowerCaseFilter();
- bool next(Token* token);
+ Token* next(Token*& token);
};
@@ -169,7 +169,7 @@
/**
* Returns the next input Token whose termText() is not a stop word.
*/
- bool next(Token* token);
+ Token* next(Token*& token);
/**
@@ -336,7 +336,7 @@
/**
* To replace accented characters in a String by unaccented equivalents.
*/
- bool next(Token* token);
+ Token* next(Token*& token);
virtual ~ISOLatin1AccentFilter();
};
@@ -352,7 +352,7 @@
int bufferSize;
public:
KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
- bool next(Token* token);
+ Token* next(Token*& token);
void reset(CL_NS(util)::Reader* input);
virtual ~KeywordTokenizer();
@@ -389,7 +389,7 @@
/**
* Returns the next input Token whose termText() is the right len
*/
- bool next(Token* token);
+ Token* next(Token*& token);
};
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -23,9 +23,9 @@
StandardFilter::~StandardFilter(){
}
- bool StandardFilter::next(Token* t) {
- if (!input->next(t))
- return false;
+ Token* StandardFilter::next(Token*& t) {
+ if (input->next(t) == NULL)
+ return NULL;
TCHAR* text = t->_termText;
const int32_t textLength = t->termLength();
@@ -38,7 +38,7 @@
text[textLength-2]=0;
t->resetTermTextLen();
- return true;
+ return t;
} else if ( type == tokenImage[ACRONYM] ) { // remove dots
int32_t j = 0;
@@ -47,10 +47,10 @@
text[j++]=text[i];
}
text[j]=0;
- return true;
+ return t;
} else {
- return true;
+ return t;
}
}
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardFilter.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -29,7 +29,7 @@
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
- bool next(Token* token);
+ Token* next(Token*& token);
};
CL_NS_END2
#endif
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -131,8 +131,13 @@
return true;
}
- bool StandardTokenizer::next(Token* t) {
+ Token* StandardTokenizer::next(Token*& t) {
int ch=0;
+
+ bool bOwnsToken = (t==NULL);
+ if (bOwnsToken)
+ t = _CLNEW Token();
+
while (!EOS) {
ch = readChar();
@@ -142,19 +147,20 @@
continue;
} else if (ALPHA || UNDERSCORE) {
tokenStart = rdPos;
- return ReadAlphaNum(ch,t);
+ if(ReadAlphaNum(ch,t))return t;
} else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
tokenStart = rdPos;
/* ReadNumber returns NULL if it fails to extract a valid number; in
** that case, we just continue. */
if (ReadNumber(NULL, ch,t))
- return true;
+ return t;
} else if ( _CJK ){
if ( ReadCJK(ch,t) )
- return true;
+ return t;
}
}
- return false;
+ if (bOwnsToken) _CLDELETE(t);
+ return NULL;
}
bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
Modified: branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/analysis/standard/StandardTokenizer.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -62,7 +62,7 @@
/** Returns the next token in the stream, or false at end-of-stream.
* The returned token's type is set to an element of
* StandardTokenizerConstants::tokenImage. */
- bool next(Token* token);
+ Token* next(Token*& token);
// Reads for number like "1"/"1234.567", or IP address like "192.168.1.2".
bool ReadNumber(const TCHAR* previousNumber, const TCHAR prev, Token* t);
Modified: branches/lucene2_3_2/src/core/CLucene/document/DateTools.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/DateTools.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/document/DateTools.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -14,6 +14,7 @@
public:
enum Resolution {
+ NO_RESOLUTION = 0,
YEAR_FORMAT, // yyyy
MONTH_FORMAT, // yyyyMM
DAY_FORMAT, // yyyyMMdd
Modified: branches/lucene2_3_2/src/core/CLucene/files_list.txt
===================================================================
(Binary files differ)
Modified: branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -287,21 +287,21 @@
// Tokenize field and add to postingTable.
CL_NS(analysis)::TokenStream* stream = analyzer->tokenStream(fieldName, reader);
+ CL_NS(analysis)::Token* t = NULL;
try {
- CL_NS(analysis)::Token t;
int32_t lastTokenEndOffset = -1;
- while (stream->next(&t)) {
- position += (t.getPositionIncrement() - 1);
+ while (stream->next(t)) {
+ position += (t->getPositionIncrement() - 1);
if(field->isStoreOffsetWithTermVector()){
TermVectorOffsetInfo tio;
- tio.setStartOffset(offset + t.startOffset());
- tio.setEndOffset(offset + t.endOffset());
- addPosition(fieldName, t.termBuffer(), position++, &tio);
+ tio.setStartOffset(offset + t->startOffset());
+ tio.setEndOffset(offset + t->endOffset());
+ addPosition(fieldName, t->termBuffer(), position++, &tio);
}else
- addPosition(fieldName, t.termBuffer(), position++, NULL);
+ addPosition(fieldName, t->termBuffer(), position++, NULL);
- lastTokenEndOffset = t.endOffset();
+ lastTokenEndOffset = t->endOffset();
length++;
// Apply field truncation policy.
if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
@@ -342,7 +342,8 @@
offset += lastTokenEndOffset + 1;
} _CLFINALLY (
stream->close();
- _CLDELETE(stream);
+ _CLLDELETE(stream);
+ _CLLDELETE(t);
);
} _CLFINALLY (
if (delReader) {
Added: branches/lucene2_3_2/src/core/CLucene/queryParser/CharStream.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/queryParser/CharStream.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/queryParser/CharStream.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -0,0 +1,16 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "_CharStream.h"
+
+CL_NS_DEF(queryParser)
+
+CharStream::~CharStream()
+{
+}
+
+CL_NS_END
Added: branches/lucene2_3_2/src/core/CLucene/queryParser/FastCharStream.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/queryParser/FastCharStream.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/queryParser/FastCharStream.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -0,0 +1,119 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "_CharStream.h"
+#include "_FastCharStream.h"
+#include "CLucene/util/CLStreams.h"
+
+CL_NS_DEF(queryParser)
+
+FastCharStream::FastCharStream(CL_NS(util)::Reader* r, bool ownsReader) : _bufferSize(0),buffer(NULL),
+ bufferLength(0),bufferPosition(0),tokenStart(0),bufferStart(0),input(r),_ownsReader(ownsReader)
+{
+}
+
+FastCharStream::~FastCharStream()
+{
+ if (_ownsReader ){
+ _CLLDELETE(input);
+ }
+ _CLDELETE_LCARRAY(buffer);
+}
+
+TCHAR FastCharStream::readChar() {
+ if (bufferPosition >= bufferLength)
+ refill();
+ return buffer[bufferPosition++];
+}
+
+void FastCharStream::refill() {
+ int32_t newPosition = bufferLength - tokenStart;
+
+ if (tokenStart == 0) { // token won't fit in buffer
+ if (buffer == NULL) { // first time: alloc buffer
+ buffer = _CL_NEWARRAY(TCHAR, 2048);
+ _bufferSize = 2048;
+ } else if (bufferLength == _bufferSize) { // grow buffer
+ _bufferSize *= 2;
+ TCHAR* newBuffer = _CL_NEWARRAY(TCHAR, _bufferSize);
+ _tcsncpy(newBuffer, buffer, bufferLength);
+ _CLDELETE_LCARRAY(buffer);
+ buffer = newBuffer;
+ }
+ } else { // shift token to front
+ _tcsncpy(buffer, buffer+tokenStart,newPosition);
+ }
+
+ bufferLength = newPosition; // update state
+ bufferPosition = newPosition;
+ bufferStart += tokenStart;
+ tokenStart = 0;
+
+ const TCHAR* charBuf = NULL;
+ int32_t charsRead = // fill space in buffer
+ input->read(charBuf, newPosition, _bufferSize-newPosition);
+ if (charsRead == -1){
+ _CLTHROWA(CL_ERR_IO, "read past eof");
+ }
+ else {
+ memcpy(buffer, charBuf, charsRead * sizeof(TCHAR)); // TODO: Can we use the reader buffer instead of copying to our own?
+ bufferLength += charsRead;
+ }
+}
+
+void FastCharStream::backup(const int32_t amount) {
+ bufferPosition -= amount;
+}
+
+TCHAR* FastCharStream::GetImage() {
+ size_t len = bufferPosition - tokenStart;
+ TCHAR* ret = _CL_NEWARRAY(TCHAR, len + 1);
+ _tcsncpy(ret, buffer+tokenStart, len);
+ ret[len] = 0; // NULL terminated string
+ return ret;
+}
+
+TCHAR* FastCharStream::GetSuffix(const int32_t len) {
+ TCHAR* value = _CL_NEWARRAY(TCHAR, len + 1);
+ _tcsncpy(value, buffer+(bufferPosition - len), len);
+ value[len] = 0; // NULL terminated string
+ return value;
+}
+
+void FastCharStream::Done() {
+ try {
+ //input->close();
+ } _CLCATCH_ERR(CL_ERR_IO, /*cleanup code*/, {
+ /*System.err.println("Caught: " + e + "; ignoring.");*/
+ })
+}
+
+TCHAR FastCharStream::BeginToken() {
+ tokenStart = bufferPosition;
+ return readChar();
+}
+
+int32_t FastCharStream::getColumn() const {
+ return bufferStart + bufferPosition;
+}
+int32_t FastCharStream::getLine() const {
+ return 1;
+}
+int32_t FastCharStream::getEndColumn() const {
+ return bufferStart + bufferPosition;
+}
+int32_t FastCharStream::getEndLine() const {
+ return 1;
+}
+int32_t FastCharStream::getBeginColumn() const {
+ return bufferStart + tokenStart;
+}
+int32_t FastCharStream::getBeginLine() const {
+ return 1;
+}
+
+CL_NS_END
Deleted: branches/lucene2_3_2/src/core/CLucene/queryParser/Lexer.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/queryParser/Lexer.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/queryParser/Lexer.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -1,371 +0,0 @@
-/*------------------------------------------------------------------------------
-* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
-*
-* Distributable under the terms of either the Apache License (Version 2.0) or
-* the GNU Lesser General Public License, as specified in the COPYING file.
-------------------------------------------------------------------------------*/
-#include "CLucene/_ApiHeader.h"
-#include "QueryParser.h"
-#include "_TokenList.h"
-#include "QueryToken.h"
-#include "_Lexer.h"
-
-#include "CLucene/util/CLStreams.h"
-#include "CLucene/util/StringBuffer.h"
-#include "CLucene/util/_FastCharStream.h"
-
-CL_NS_USE(util)
-
-CL_NS_DEF(queryParser)
-Lexer::Lexer(QueryParserBase* queryparser, const TCHAR* query) {
- //Func - Constructor
- //Pre - query != NULL and contains the query string
- //Post - An instance of Lexer has been created
-
- this->queryparser = queryparser;
-
- CND_PRECONDITION(query != NULL, "query is NULL");
-
- //The InputStream of Reader must be destroyed in the destructor
- delSR = true;
-
- StringReader *r = _CLNEW StringReader(query);
-
- //Check to see if r has been created properly
- CND_CONDITION(r != NULL, "Could not allocate memory for StringReader r");
-
- //Instantie a FastCharStream instance using r and assign it to reader
- reader = _CLNEW FastCharStream(r);
-
- //Check to see if reader has been created properly
- CND_CONDITION(reader != NULL, "Could not allocate memory for FastCharStream reader");
-
- //The InputStream of Reader must be destroyed in the destructor
- delSR = true;
-
-}
-
-
-Lexer::Lexer(QueryParserBase* queryparser, BufferedReader* source) {
- //Func - Constructor
- // Initializes a new instance of the Lexer class with the specified
- // TextReader to lex.
- //Pre - Source contains a valid reference to a Reader
- //Post - An instance of Lexer has been created using source as the reader
-
- this->queryparser = queryparser;
-
- //Instantie a FastCharStream instance using r and assign it to reader
- reader = _CLNEW FastCharStream(source);
-
- //Check to see if reader has been created properly
- CND_CONDITION(reader != NULL, "Could not allocate memory for FastCharStream reader");
-
- //The InputStream of Reader must not be destroyed in the destructor
- delSR = false;
-}
-
-
-Lexer::~Lexer() {
- //Func - Destructor
- //Pre - true
- //Post - if delSR was true the InputStream input of reader has been deleted
- // The instance of Lexer has been destroyed
-
- if (delSR) {
- _CLDELETE(reader->input);
- }
-
- _CLDELETE(reader);
-}
-
-
-void Lexer::Lex(TokenList *tokenList) {
- //Func - Breaks the input stream onto the tokens list tokens
- //Pre - tokens != NULL and contains a TokenList in which the tokens can be stored
- //Post - The tokens have been added to the TokenList tokens
-
- CND_PRECONDITION(tokenList != NULL, "tokens is NULL");
-
- //Get all the tokens
- while(true) {
- //Add the token to the tokens list
-
- //Get the next token
- QueryToken* token = _CLNEW QueryToken;
- if ( !GetNextToken(token) ){
- _CLDELETE(token);
- break;
- }
- tokenList->add(token);
- }
-
- //The end has been reached so create an EOF_ token
- //Add the final token to the TokenList _tokens
- tokenList->add(_CLNEW QueryToken( QueryToken::EOF_));
-}
-
-
-bool Lexer::GetNextToken(QueryToken* token) {
- while(!reader->Eos()) {
- int ch = reader->GetNext();
-
- if ( ch == -1 )
- break;
-
- // skipping whitespaces
- if( _istspace(ch)!=0 ) {
- continue;
- }
- TCHAR buf[2] = {ch,'\0'};
- switch(ch) {
- case '+':
- token->set(buf, QueryToken::PLUS);
- return true;
- case '-':
- token->set(buf, QueryToken::MINUS);
- return true;
- case '(':
- token->set(buf, QueryToken::LPAREN);
- return true;
- case ')':
- token->set(buf, QueryToken::RPAREN);
- return true;
- case ':':
- token->set(buf, QueryToken::COLON);
- return true;
- case '!':
- token->set(buf, QueryToken::NOT);
- return true;
- case '^':
- token->set(buf, QueryToken::CARAT);
- return true;
- case '~':
- if( _istdigit( reader->Peek() )!=0 ) {
- TCHAR number[LUCENE_MAX_FIELD_LEN];
- ReadIntegerNumber(ch, number,LUCENE_MAX_FIELD_LEN);
- token->set(number, QueryToken::SLOP);
- return true;
- }else{
- token->set(buf, QueryToken::FUZZY);
- return true;
- }
- break;
- case '"':
- return ReadQuoted(ch, token);
- case '[':
- return ReadInclusiveRange(ch, token);
- case '{':
- return ReadExclusiveRange(ch, token);
- case ']':
- case '}':
- case '*':
- queryparser->throwParserException( _T("Unrecognized char %d at %d::%d."),
- ch, reader->Column(), reader->Line() );
- return false;
- default:
- return ReadTerm(ch, token);
-
- // end of swith
- }
-
- }
- return false;
-}
-
-
-void Lexer::ReadIntegerNumber(const TCHAR ch, TCHAR* buf, int buflen) {
- int bp=0;
- buf[bp++] = ch;
-
- int c = reader->Peek();
- while( c!=-1 && _istdigit(c)!=0 && bp<buflen-1 ) {
- buf[bp++] = reader->GetNext();
- c = reader->Peek();
- }
- buf[bp++] = 0;
-}
-
-
-bool Lexer::ReadInclusiveRange(const TCHAR prev, QueryToken* token) {
- int ch = prev;
- StringBuffer range;
- range.appendChar(ch);
-
- while(!reader->Eos()) {
- ch = reader->GetNext();
- if ( ch == -1 )
- break;
- range.appendChar(ch);
-
- if(ch == ']'){
- token->set(range.getBuffer(), QueryToken::RANGEIN);
- return true;
- }
- }
- queryparser->throwParserException(_T("Unterminated inclusive range! %d %d::%d"),' ',
- reader->Column(),reader->Column());
- return false;
-}
-
-
-bool Lexer::ReadExclusiveRange(const TCHAR prev, QueryToken* token) {
- int ch = prev;
- StringBuffer range;
- range.appendChar(ch);
-
- while(!reader->Eos()) {
- ch = reader->GetNext();
-
- if (ch==-1)
- break;
- range.appendChar(ch);
-
- if(ch == '}'){
- token->set(range.getBuffer(), QueryToken::RANGEEX);
- return true;
- }
- }
- queryparser->throwParserException(_T("Unterminated exclusive range! %d %d::%d"),' ',
- reader->Column(),reader->Column() );
- return false;
-}
-
-bool Lexer::ReadQuoted(const TCHAR prev, QueryToken* token) {
- int ch = prev;
- StringBuffer quoted;
- quoted.appendChar(ch);
-
- while(!reader->Eos()) {
- ch = reader->GetNext();
-
- if (ch==-1)
- break;
-
- quoted.appendChar(ch);
-
- if(ch == '"'){
- token->set(quoted.getBuffer(), QueryToken::QUOTED);
- return true;
- }
- }
- queryparser->throwParserException(_T("Unterminated string! %d %d::%d"),' ',
- reader->Column(),reader->Column());
- return false;
-}
-
-
-bool Lexer::ReadTerm(const TCHAR prev, QueryToken* token) {
- int ch = prev;
- bool completed = false;
- int32_t asteriskCount = 0;
- bool hasQuestion = false;
-
- StringBuffer val;
- TCHAR buf[3]; //used for readescaped
-
- while(true) {
- switch(ch) {
- case -1:
- break;
- case '\\':
- {
- if ( ReadEscape(ch, buf) )
- val.append( buf );
- else
- return false;
- }
- break;
-
- case LUCENE_WILDCARDTERMENUM_WILDCARD_STRING:
- asteriskCount++;
- val.appendChar(ch);
- break;
- case LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR:
- hasQuestion = true;
- val.appendChar(ch);
- break;
- case '\n':
- case '\t':
- case ' ':
- case '+':
- case '-':
- case '!':
- case '(':
- case ')':
- case ':':
- case '^':
- case '[':
- case ']':
- case '{':
- case '}':
- case '~':
- case '"':
- // create new QueryToken
- reader->UnGet();
- completed = true;
- break;
- default:
- val.appendChar(ch);
- break;
- // end of switch
- }
-
- if(completed || ch==-1 || reader->Eos() )
- break;
- else
- ch = reader->GetNext();
- }
-
- // create new QueryToken
- if(hasQuestion)
- token->set(val.getBuffer(), QueryToken::WILDTERM);
- else if(asteriskCount == 1 && val.getBuffer()[val.length() - 1] == '*')
- token->set(val.getBuffer(), QueryToken::PREFIXTERM);
- else if(asteriskCount > 0)
- token->set(val.getBuffer(), QueryToken::WILDTERM);
- else if( _tcsicmp(val.getBuffer(), _T("AND"))==0 || _tcscmp(val.getBuffer(), _T("&&"))==0 )
- token->set(val.getBuffer(), QueryToken::AND_);
- else if( _tcsicmp(val.getBuffer(), _T("OR"))==0 || _tcscmp(val.getBuffer(), _T("||"))==0)
- token->set(val.getBuffer(), QueryToken::OR);
- else if( _tcsicmp(val.getBuffer(), _T("NOT"))==0 )
- token->set(val.getBuffer(), QueryToken::NOT);
- else {
- bool isnum = true;
- int32_t nlen=val.length();
- for (int32_t i=0;i<nlen;++i) {
- TCHAR ch=val.getBuffer()[i];
- if ( _istalpha(ch) ) {
- isnum=false;
- break;
- }
- }
-
- if ( isnum )
- token->set(val.getBuffer(), QueryToken::NUMBER);
- else
- token->set(val.getBuffer(), QueryToken::TERM);
- }
- return true;
-}
-
-
-bool Lexer::ReadEscape(TCHAR prev, TCHAR* buf) {
- TCHAR ch = prev;
- int bp=0;
- buf[bp++] = ch;
-
- ch = reader->GetNext();
- int32_t idx = _tcscspn( buf, _T("\\+-!():^[]{}\"~*") );
- if(idx == 0) {
- buf[bp++] = ch;
- buf[bp++]=0;
- return true;
- }
- queryparser->throwParserException(_T("Unrecognized escape sequence at %d %d::%d"), ' ',
- reader->Column(),reader->Line());
- return false;
-}
-
-
-CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/queryParser/MultiFieldQueryParser.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/queryParser/MultiFieldQueryParser.h 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/queryParser/MultiFieldQueryParser.h 2009-05-02 21:26:19 UTC (rev 3004)
@@ -4,8 +4,8 @@
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
-#ifndef MultiFieldQueryParser_H
-#define MultiFieldQueryParser_H
+#ifndef _lucene_queryParser_MultiFieldQueryParser_
+#define _lucene_queryParser_MultiFieldQueryParser_
//#include "CLucene/analysis/AnalysisHeader.h"
Modified: branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParser.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParser.cpp 2009-05-02 19:53:35 UTC (rev 3003)
+++ branches/lucene2_3_2/src/core/CLucene/queryParser/QueryParser.cpp 2009-05-02 21:26:19 UTC (rev 3004)
@@ -5,18 +5,30 @@
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/_ApiHeader.h"
+#include "_CharStream.h"
+#include "_FastCharStream.h"
+#include "QueryParserConstants.h"
+#include "QueryParserTokenManager.h"
#include "QueryParser.h"
#include "CLucene/analysis/AnalysisHeader.h"
-#include "CLucene/util/CLStreams.h"
+
#include "CLucene/search/SearchHeader.h"
-#include "CLucene/search/BooleanClause.h"
+
#include "CLucene/search/Query.h"
+#include "CLucene/search/TermQuery.h"
+#include "CLucene/search/BooleanQuery.h"
+#include "CLucene/search/FuzzyQuery.h"
+#include "CLucene/search/PhraseQuery.h"
+#include "CLucene/search/WildcardQuery.h"
+#include "CLucene/search/PrefixQuery.h"
+#include "CLucene/search/RangeQuery.h"
+
#include "CLucene/index/Term.h"
-#include "QueryToken.h"
+#include "Token.h"
-#include "_TokenList.h"
-#include "_Lexer.h"
+#include "CLucene/util/CLStreams.h"
+#include "CLucene/util/StringBuffer.h"
CL_NS_USE(util)
CL_NS_USE(index)
@@ -25,484 +37,1424 @@
CL_NS_DEF(queryParser)
- QueryParser::QueryParser(const TCHAR* _field, Analyzer* _analyzer) : QueryParserBase(_analyzer){
- //Func - Constructor.
- // Instantiates a QueryParser for the named field _field
- //Pre - _field != NULL
- //Post - An instance has been created
+const TCHAR* QueryParserConstants::tokenImage[] = {
+ _T("<EOF>"),
+ _T("<_NUM_CHAR>"),
+ _T("<_ESCAPED_CHAR>"),
+ _T("<_TERM_START_CHAR>"),
+ _T("<_TERM_CHAR>"),
+ _T("<_WHITESPACE>"),
+ _T("<token of kind 6>"),
+ _T("<AND>"),
+ _T("<OR>"),
+ _T("<NOT>"),
+ _T("\"+\""),
+ _T("\"-\""),
+ _T("\"(\""),
+ _T("\")\""),
+ _T("\":\""),
+ _T("\"*\""),
+ _T("\"^\""),
+ _T("<QUOTED>"),
+ _T("<TERM>"),
+ _T("<FUZZY_SLOP>"),
+ _T("<PREFIXTERM>"),
+ _T("<WILDTERM>"),
+ _T("\"[\""),
+ _T("\"{\""),
+ _T("<NUMBER>"),
+ _T("\"TO\""),
+ _T("\"]\""),
+ _T("<RANGEIN_QUOTED>"),
+ _T("<RANGEIN_GOOP>"),
+ _T("\"TO\""),
+ _T("\"}\""),
+ _T("<RANGEEX_QUOTED>"),
+ _T("<RANGEEX_GOOP>")
+};
- if ( _field )
- field = STRDUP_TtoT(_field);
- else
- field = NULL;
- tokens = NULL;
- lowercaseExpandedTerms = true;
- }
+const int32_t QueryParser::jj_la1_0[] = {0x180,0x180,0xe00,0xe00,0x1f69f80,0x48000,0x10000,0x1f69000,0x1348000,0x80000,0x80000,0x10000,0x18000000,0x2000000,0x18000000,0x10000,0x80000000,0x20000000,0x80000000,0x10000,0x80000,0x10000,0x1f68000};
+const int32_t QueryParser::jj_la1_1[] = {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0x0,0x0};
- QueryParser::~QueryParser() {
- //Func - Destructor
- //Pre - true
- //Post - The instance has been destroyed
+QueryParser::QueryParser(const TCHAR* f, Analyzer* a) : _operator(OR_OPERATOR),
+ lowercaseExpandedTerms(true),useOldRangeQuery(false),allowLeadingWildcard(false),enablePositionIncrements(false),
+ analyzer(a),field(NULL),phraseSlop(0),fuzzyMinSim(FuzzyQuery::defaultMinSimilarity),
+ fuzzyPrefixLength(FuzzyQuery::defaultPrefixLength),/*locale(NULL),*/
+ dateResolution(CL_NS(document)::DateTools::NO_RESOLUTION),fieldToDateResolution(NULL),
+ token_source(NULL),token(NULL),jj_nt(NULL),_firstToken(NULL),jj_ntk(-1),jj_scanpos(NULL),jj_lastpos(NULL),jj_la(0),
+ lookingAhead(false),jj_gen(0),jj_2_rtns(NULL),jj_rescan(false),jj_gc(0),jj_expentries(NULL),jj_expentry(NULL),
+ jj_kind(-1),jj_endpos(0)
+{
+ StringReader* rdr = _CLNEW StringReader(_T(""));
+ _init(_CLNEW FastCharStream(rdr, true));
- _CLDELETE_CARRAY(field);
+ if ( f )
+ field = STRDUP_TtoT(f);
+}
+
+void QueryParser::_deleteTokens(){
+ Token* t = _firstToken;
+ while (true){
+ if (_firstToken == NULL) break;
+ t = _firstToken->next;
+ _CLLDELETE(_firstToken);
+ _firstToken = t;
}
+}
- //static
- Query* QueryParser::parse(const TCHAR* query, const TCHAR* field, Analyzer* analyzer){
- //Func - Returns a new instance of the Query class with a specified query, field and
- // analyzer values.
- //Pre - query != NULL and holds the query to parse
- // field != NULL and holds the default field for query terms
- // analyzer holds a valid reference to an Analyzer and is used to
- // find terms in the query text
- //Post - query has been parsed and an instance of Query has been returned
+QueryParser::~QueryParser(){
+ _CLLDELETE(fieldToDateResolution);
+ _CLLDELETE(token_source);
- CND_PRECONDITION(query != NULL, "query is NULL");
- CND_PRECONDITION(field != NULL, "field is NULL");
+ _deleteTokens();
- QueryParser parser(field, analyzer);
- return parser.parse(query);
+ _CLLDELETE(jj_expentries);
+ _CLLDELETE(jj_expentry);
+ _CLLDELETE(jj_2_rtns);
+
+ _CLLDELETE(field);
+}
+
+Query* QueryParser::parse(const TCHAR* _query)
+{
+ StringReader* rdr = _CLNEW StringReader(_query);
+ ReInit(_CLNEW FastCharStream(rdr, true));
+ try {
+ // TopLevelQuery is a Query followed by the end-of-input (EOF)
+ Query* res = TopLevelQuery(field);
+ return (res!=NULL) ? res : _CLNEW BooleanQuery();
}
+ catch (CLuceneError& e) {
+ // rethrow to include the original query:
+ if (e.number()==CL_ERR_Parse || e.number()==CL_ERR_TokenMgr) {
+ TCHAR* _twhat = e.twhat();
+ const size_t errLen = _tcslen(_twhat) + _tcslen(_query) + 20; // make sure we have enough room for our error message
+ TCHAR *err = _CL_NEWARRAY(TCHAR,errLen);
+ cl_stprintf(err, errLen, _T("Cannot parse '%s': %s"), _query,_twhat);
+ _CLTHROWT_DEL(CL_ERR_Parse, err);
+ } else if (e.number()==CL_ERR_TooManyClauses) {
+ const size_t errLen = _tcslen(_query) + 25; // make sure we have enough room for our error message
+ TCHAR *err = _CL_NEWARRAY(TCHAR,errLen);
+ cl_stprintf(err, errLen, _T("Cannot parse '%s': too many boolean clauses"), _query);
+ _CLTHROWT_DEL(CL_ERR_Parse, err);
+ } else
+ throw e;
+ }
+}
- Query* QueryParser::parse(const TCHAR* query){
- //Func - Returns a parsed Query instance
- //Pre - query != NULL and contains the query value to be parsed
- //Post - Returns a parsed Query Instance
+Analyzer* QueryParser::getAnalyzer() const {
+ return analyzer;
+}
- CND_PRECONDITION(query != NULL, "query is NULL");
+TCHAR* QueryParser::getField() const {
+ return field;
+}
- //Instantie a Stringer that can read the query string
- BufferedReader* r = _CLNEW StringReader(query);
+float_t QueryParser::getFuzzyMinSim() const {
+ return fuzzyMinSim;
+}
- //Check to see if r has been created properly
- CND_CONDITION(r != NULL, "Could not allocate memory for StringReader r");
+void QueryParser::setFuzzyMinSim(const float_t _fuzzyMinSim) {
+ fuzzyMinSim = _fuzzyMinSim;
+}
- //Pointer for the return value
- Query* ret = NULL;
+int32_t QueryParser::getFuzzyPrefixLength() const {
+ return fuzzyPrefixLength;
+}
- try{
- //Parse the query managed by the StringReader R and return a parsed Query instance
- //into ret
- ret = parse(r);
- }_CLFINALLY (
- _CLDELETE(r);
- );
+void QueryParser::setFuzzyPrefixLength(const int32_t _fuzzyPrefixLength) {
+ fuzzyPrefixLength = _fuzzyPrefixLength;
+}
- return ret;
+void QueryParser::setPhraseSlop(const int32_t _phraseSlop) {
+ phraseSlop = _phraseSlop;
+}
+int32_t QueryParser::getPhraseSlop() const {
+ return phraseSlop;
+}
+void QueryParser::setAllowLeadingWildcard(const bool _allowLeadingWildcard) {
+ allowLeadingWildcard = _allowLeadingWildcard;
+}
+bool QueryParser::getAllowLeadingWildcard() const {
+ return allowLeadingWildcard;
+}
+void QueryParser::setEnablePositionIncrements(const bool _enable) {
+ enablePositionIncrements = _enable;
+}
+bool QueryParser::getEnablePositionIncrements() const {
+ return enablePositionIncrements;
+}
+void QueryParser::setDefaultOperator(Operator _op) {
+ _operator = _op;
+}
+QueryParser::Operator QueryParser::getDefaultOperator() const {
+ return _operator;
+}
+void QueryParser::setLowercaseExpandedTerms(const bool _lowercaseExpandedTerms) {
+ lowercaseExpandedTerms = _lowercaseExpandedTerms;
+}
+bool QueryParser::getLowercaseExpandedTerms() const {
+ return lowercaseExpandedTerms;
+}
+void QueryParser::setUseOldRangeQuery(const bool _useOldRangeQuery) {
+ useOldRangeQuery = _useOldRangeQuery;
+}
+bool QueryParser::getUseOldRangeQuery() const {
+ return useOldRangeQuery;
+}
+void QueryParser::setDateResolution(const CL_NS(document)::DateTools::Resolution _dateResolution) {
+ dateResolution = _dateResolution;
+}
+void QueryParser::setDateResolution(const TCHAR* fieldName, const CL_NS(document)::DateTools::Resolution _dateResolution) {
+ if (fieldName == NULL)
+ _CLTHROWA(CL_ERR_IllegalArgument, "Field cannot be null.");
+
+ if (fieldToDateResolution == NULL) {
+ // lazily initialize HashMap
+ fieldToDateResolution = _CLNEW CL_NS(util)::CLHashMap<const TCHAR*,
+ CL_NS(document)::DateTools::Resolution,
+ CL_NS(util)::Compare::TChar,
+ CL_NS(util)::Equals::TChar,
+ CL_NS(util)::Deletor::tcArray,
+ CL_NS(util)::Deletor::DummyInt32
+ >();
}
- Query* QueryParser::parse(BufferedReader* reader){
- //Func - Returns a parsed Query instance
- //Pre - reader contains a valid reference to a Reader and manages the query string
- //Post - A parsed Query instance has been returned or
+ fieldToDateResolution->put(fieldName, _dateResolution);
+}
+CL_NS(document)::DateTools::Resolution QueryParser::getDateResolution(const TCHAR* fieldName) const {
+ if (fieldName == NULL)
+ _CLTHROWA(CL_ERR_IllegalArgument,"Field cannot be null.");
- //instantiate the TokenList tokens
- TokenList _tokens;
- this->tokens = &_tokens;
+ if (fieldToDateResolution == NULL) {
+ // no field specific date resolutions set; return default date resolution instead
+ return dateResolution;
+ }
- //Instantiate a lexer
- Lexer lexer(this, reader);
+ CL_NS(document)::DateTools::Resolution resolution = fieldToDateResolution->get(fieldName);
+ if (resolution == NULL) {
+ // no date resolutions set for the given field; return default date resolution instead
+ resolution = dateResolution;
+ }
- //tokens = lexer.Lex();
- //Lex the tokens
- lexer.Lex(tokens);
+ return resolution;
+}
- //Peek to the first token and check if is an EOF
- if (tokens->peek()->Type == QueryToken::EOF_){
- // The query string failed to yield any tokens. We discard the
- // TokenList tokens and raise an exceptioin.
- QueryToken* token = this->tokens->extract();
- _CLDELETE(token);
- _CLTHROWA(CL_ERR_Parse, "No query given.");
- }
+void QueryParser::addClause(std::vector<BooleanClause*>& clauses, int32_t conj, int32_t mods, Query* q){
+ bool required, prohibited;
- //Return the parsed Query instance
- Query* ret = MatchQuery(field);
- this->tokens = NULL;
- return ret;
+ // If this term is introduced by AND, make the preceding term required,
+ // unless it's already prohibited
+ const uint32_t nPreviousClauses = clauses.size();
+ if (nPreviousClauses > 0 && conj == CONJ_AND) {
+ BooleanClause* c = clauses[nPreviousClauses-1];
+ if (!c->isProhibited())
+ c->setOccur(BooleanClause::MUST);
}
- int32_t QueryParser::MatchConjunction(){
- //Func - matches for CONJUNCTION
- // CONJUNCTION ::= <AND> | <OR>
- //Pre - tokens != NULL
- //Post - if the first token is an AND or an OR then
- // the token is extracted and deleted and CONJ_AND or CONJ_OR is returned
- // otherwise CONJ_NONE is returned
+ if (nPreviousClauses > 0 && _operator == AND_OPERATOR && conj == CONJ_OR) {
+ // If this term is introduced by OR, make the preceding term optional,
+ // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
+ // notice if the input is a OR b, first term is parsed as required; without
+ // this modification a OR b would parsed as +a OR b
+ BooleanClause* c = clauses[nPreviousClauses-1];
+ if (!c->isProhibited())
+ c->setOccur(BooleanClause::SHOULD);
+ }
- CND_PRECONDITION(tokens != NULL, "tokens is NULL");
+ // We might have been passed a null query; the term might have been
+ // filtered away by the analyzer.
+ if (q == NULL)
+ return;
- switch(tokens->peek()->Type){
- case QueryToken::AND_ :
- //Delete the first token of tokenlist
- ExtractAndDeleteToken();
- return CONJ_AND;
- case QueryToken::OR :
- //Delete the first token of tokenlist
- ExtractAndDeleteToken();
- return CONJ_OR;
- default :
- return CONJ_NONE;
+ if (_operator == OR_OPERATOR) {
+ // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
+ // introduced by NOT or -; make sure not to set both.
+ prohibited = (mods == MOD_NOT);
+ required = (mods == MOD_REQ);
+ if (conj == CONJ_AND && !prohibited) {
+ required = true;
}
+ } else {
+ // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
+ // if not PROHIBITED and not introduced by OR
+ prohibited = (mods == MOD_NOT);
+ required = (!prohibited && conj != CONJ_OR);
}
+ if (required && !prohibited)
+ clauses.push_back(_CLNEW BooleanClause(q,true, BooleanClause::MUST));
+ else if (!required && !prohibited)
+ clauses.push_back(_CLNEW BooleanClause(q,true, BooleanClause::SHOULD));
+ else if (!required && prohibited)
+ clauses.push_back(_CLNEW BooleanClause(q,true, BooleanClause::MUST_NOT));
+ else {
+ _CLTHROWA(CL_ERR_Runtime, "Clause cannot be both required and prohibited");
+ }
+}
- int32_t QueryParser::MatchModifier(){
- //Func - matches for MODIFIER
- // MODIFIER ::= <PLUS> | <MINUS> | <NOT>
- //Pre - tokens != NULL
- //Post - if the first token is a PLUS the token is extracted and deleted and MOD_REQ is returned
- // if the first token is a MINUS or NOT the token is extracted and deleted and MOD_NOT is returned
- // otherwise MOD_NONE is returned
- CND_PRECONDITION(tokens != NULL, "tokens is NULL");
+Query* QueryParser::getFieldQuery(const TCHAR* _field, const TCHAR* queryText) {
+ // Use the analyzer to get all the tokens, and then build a TermQuery,
+ // PhraseQuery, or nothing based on the term count
- switch(tokens->peek()->Type){
- case QueryToken::PLUS :
- //Delete the first token of tokenlist
- ExtractAndDeleteToken();
- return MOD_REQ;
- case QueryToken::MINUS :
- case QueryToken::NOT :
- //Delete the first token of tokenlist
- ExtractAndDeleteToken();
- return MOD_NOT;
- default :
- return MOD_NONE;
+ StringReader reader(queryText);
+ TokenStream* source = analyzer->tokenStream(_field, &reader);
+
+ CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v;
+ CL_NS(analysis)::Token* t = NULL;
+ int32_t positionCount = 0;
+ bool severalTokensAtSamePosition = false;
+
+ while (true) {
+ t = NULL;
+ try {
+ t = source->next(t);
}
+ _CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);_CLDELETE_LCARRAY(queryText);,{
+ t = NULL;
+ });
+ if (t == NULL)
+ break;
+ v.push_back(t);
+ if (t->getPositionIncrement() != 0)
+ positionCount += t->getPositionIncrement();
+ else
+ severalTokensAtSamePosition = true;
}
+ try {
+ source->close();
+ }
+ _CLCATCH_ERR(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);_CLDELETE_LCARRAY(queryText);},/*ignore CL_ERR_IO */);
+ _CLLDELETE(source);
- Query* QueryParser::MatchQuery(const TCHAR* field){
- //Func - matches for QUERY
- // QUERY ::= [MODIFIER] QueryParser::CLAUSE (<CONJUNCTION> [MODIFIER] CLAUSE)*
- //Pre - field != NULL
- //Post -
+ if (v.size() == 0)
+ return NULL;
+ else if (v.size() == 1) {
+ Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer());
+ Query* ret = _CLNEW TermQuery( tm );
+ _CLDECDELETE(tm);
+ return ret;
+ } else {
+ if (severalTokensAtSamePosition) {
+ if (positionCount == 1) {
+ // no phrase query:
+ BooleanQuery* q = _CLNEW BooleanQuery(true);
+ for(size_t i=0; i<v.size(); i++ ){
+ Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer());
+ q->add(_CLNEW TermQuery(tm),BooleanClause::SHOULD);
+ _CLDECDELETE(tm);
+ }
+ return q;
+ }
+ else {
+ _CLDELETE_LCARRAY(queryText);
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "MultiPhraseQuery NOT Implemented");
+ /*
+ // TODO: phrase query:
+ MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery();
+ mpq.setSlop(phraseSlop);
+ List multiTerms = new ArrayList();
+ int32_t position = -1;
+ for (int32_t i = 0; i < v.size(); i++) {
+ t = (org.apache.lucene.analysis.Token) v.elementAt(i);
+ if (t.getPositionIncrement() > 0 && multiTerms.size() > 0) {
+ if (enablePositionIncrements) {
+ mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
+ } else {
+ mpq.add((Term[])multiTerms.toArray(new Term[0]));
+ }
+ multiTerms.clear();
+ }
+ position += t.getPositionIncrement();
+ multiTerms.add(_CLNEW Term(field, t.termText()));
+ }
+ if (enablePositionIncrements) {
+ mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
+ } else {
+ mpq.add((Term[])multiTerms.toArray(new Term[0]));
+ }
+ return mpq;
+ */
+ }
+ }
+ else {
+ PhraseQuery* pq = _CLNEW PhraseQuery();
+ pq->setSlop(phraseSlop);
+ int32_t position = -1;
- CND_PRECONDITION(tokens != NULL, "tokens is NULL");
+ for (size_t i = 0; i < v.size(); i++) {
+ t = v.at(i);
+ Term* tm = _CLNEW Term(_field, t->termBuffer());
+ if (enablePositionIncrements) {
+ position += t->getPositionIncrement();
+ pq->add(tm,position);
+ } else {
+ pq->add(tm);
+ }
+ _CLDECDELETE(tm);
+ }
+ return pq;
+ }
+ }
+}
- vector<BooleanClause*> clauses;
+Query* QueryParser::getFieldQuery(const TCHAR* _field, const TCHAR* queryText, const int32_t slop) {
+ Query* query = getFieldQuery(_field, queryText);
- Query* q = NULL;
+ if ( query && strcmp(query->getQueryName(),PhraseQuery::getClassName()) == 0) {
+ static_cast<PhraseQuery*>(query)->setSlop(slop);
+ }
+ /*
+ // TODO: Add MultiPhraseQuery support
+ if (query instanceof MultiPhraseQuery) {
+ ((MultiPhraseQuery) query).setSlop(slop);
+ }
+ */
+ return query;
+}
- int32_t mods = MOD_NONE;
- int32_t conj = CONJ_NONE;
+Query* QueryParser::getRangeQuery(const TCHAR* _field, TCHAR* part1, TCHAR* part2, const bool inclusive)
+{
+ if (lowercaseExpandedTerms) {
+ _tcslwr(part1);
+ _tcslwr(part2);
+ }
+ /*
+ // TODO: Complete porting of the code below
+ try {
+ DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+ df.setLenient(true);
+ Date d1 = df.parse(part1);
+ Date d2 = df.parse(part2);
+ if (inclusive) {
+ // The user can only specify the date, not the time, so make sure
+ // the time is set to the latest possible time of that date to really
+ // include all documents:
+ Calendar cal = Calendar.getInstance(locale);
+ cal.setTime(d2);
+ cal.set(Calendar.HOUR_OF_DAY, 23);
+ cal.set(Calendar.MINUTE, 59);
+ cal.set(Calendar.SECOND, 59);
+ cal.set(Calendar.MILLISECOND, 999);
+ d2 = cal.getTime();
+ }
+ CL_NS(document)::DateTools::Resolution resolution = getDateResolution(_field);
+ if (resolution == NULL) {
+ // no default or field specific date resolution has been set,
+ // use deprecated DateField to maintain compatibilty with
+ // pre-1.9 Lucene versions.
+ part1 = DateField.dateToString(d1);
+ part2 = DateField.dateToString(d2);
+ } else {
+ part1 = CL_NS(document)::DateTools::dateToString(d1, resolution);
+ part2 = CL_NS(document)::DateTools::dateToString(d2, resolution);
+ }
+ }
+ catch (...) { }
+ */
- //match for MODIFIER
- mods = MatchModifier();
+ //if(useOldRangeQuery)
+ //{
+ Term* t1 = _CLNEW Term(_field,part1);
+ Term* t2 = _CLNEW Term(_field,part2);
+ Query* ret = _CLNEW RangeQuery(t1, t2, inclusive);
+ _CLDECDELETE(t1);
+ _CLDECDELETE(t2);
+ return ret;
+ /*}
+ else
+ {
+ // TODO: Port ConstantScoreRangeQuery and enable this section
+ return _CLNEW ConstantScoreRangeQuery(_field,part1,part2,inclusive,inclusive);
+ }*/
+}
- //match for CLAUSE
- q = MatchClause(field);
- AddClause(clauses, CONJ_NONE, mods, q);
+Query* QueryParser::getBooleanQuery(std::vector<CL_NS(search)::BooleanClause*>& clauses, bool disableCoord)
+{
+ if (clauses.size()==0) {
+ return NULL; // all clause words were filtered away by the analyzer.
+ }
+ BooleanQuery* query = _CLNEW BooleanQuery(disableCoord);
- // match for CLAUSE*
- while(true){
- QueryToken* p = tokens->peek();
- if(p->Type == QueryToken::EOF_){
- QueryToken* qt = MatchQueryToken(QueryToken::EOF_);
- _CLDELETE(qt);
- break;
- }
+ for (size_t i = 0; i < clauses.size(); i++) {
+ query->add(clauses[i]);
+ }
+ return query;
+}
- if(p->Type == QueryToken::RPAREN){
- //MatchQueryToken(QueryToken::RPAREN);
- break;
- }
+Query* QueryParser::getWildcardQuery(const TCHAR* _field, TCHAR* termStr)
+{
+ if (_tcscmp(_T("*"), _field) == 0) {
+ if (_tcscmp(_T("*"), termStr) == 0) return NULL;
+ // TODO: Implement MatchAllDocsQuery
+ //return _CLNEW MatchAllDocsQuery();
+ }
+ if (!allowLeadingWildcard && (termStr[0]==_T('*') || termStr[0]==_T('?'))){
+ _CLDELETE_LCARRAY(termStr);
+ _CLTHROWT(CL_ERR_Parse,_T("'*' or '?' not allowed as first character in WildcardQuery"));
+ }
+ if (lowercaseExpandedTerms) {
+ _tcslwr(termStr);
+ }
- //match for a conjuction (AND OR NOT)
- conj = MatchConjunction();
- //match for a modifier
- mods = MatchModifier();
+ Term* t = _CLNEW Term(_field, termStr);
+ Query* q = _CLNEW WildcardQuery(t);
+ _CLDECDELETE(t);
- q = MatchClause(field);
- if ( q != NULL )
- AddClause(clauses, conj, mods, q);
- }
+ return q;
+}
- // finalize query
- if(clauses.size() == 1){ //bvk: removed this && firstQuery != NULL
- BooleanClause* c = clauses[0];
- Query* q = c->getQuery();
+Query* QueryParser::getPrefixQuery(const TCHAR* _field, TCHAR* _termStr)
+{
+ if (!allowLeadingWildcard && _termStr[0] == _T('*')){
+ _CLDELETE_LCARRAY(_termStr);
+ _CLTHROWT(CL_ERR_Parse,_T("'*' not allowed as first character in PrefixQuery"));
+ }
+ if (lowercaseExpandedTerms) {
+ _tcslwr(_termStr);
+ }
+ Term* t = _CLNEW Term(_field, _termStr);
+ Query *q = _CLNEW PrefixQuery(t);
+ _CLDECDELETE(t);
+ return q;
+}
- //Condition check to be sure clauses[0] is valid
- CND_CONDITION(c != NULL, "c is NULL");
+Query* QueryParser::getFuzzyQuery(const TCHAR* _field, TCHAR* termStr, const float_t minSimilarity)
+{
+ if (lowercaseExpandedTerms) {
+ _tcslwr(termStr);
+ }
- //Tell the boolean clause not to delete its query
- c->deleteQuery=false;
- //Clear the clauses list
- clauses.clear();
- _CLDELETE(c);
+ Term* t = _CLNEW Term(_field, termStr);
+ Query *q = _CLNEW FuzzyQuery(t, minSimilarity, fuzzyPrefixLength);
+ _CLDECDELETE(t);
+ return q;
+}
- return q;
- }else{
- return GetBooleanQuery(clauses);
- }
+TCHAR* QueryParser::discardEscapeChar(TCHAR* input, TCHAR* output) {
+ // Create char array to hold unescaped char sequence
+ const size_t inputLen = _tcslen(input);
+ bool outputOwned=false;
+ if (output == NULL){
+ output = _CL_NEWARRAY(TCHAR, inputLen + 1);
+ outputOwned=true;
}
- Query* QueryParser::MatchClause(const TCHAR* field){
- //Func - matches for CLAUSE
- // CLAUSE ::= [TERM <COLONQueryParser::>] ( TERM | (<LPAREN> QUERY <RPAREN>))
- //Pre - field != NULL
- //Post -
+ // The length of the output can be less than the input
+ // due to discarded escape chars. This variable holds
+ // the actual length of the output
+ int32_t length = 0;
- Query* q = NULL;
- const TCHAR* sfield = field;
- bool delField = false;
+ // We remember whether the last processed character was
+ // an escape character
+ bool lastCharWasEscapeChar = false;
- QueryToken *DelToken = NULL;
+ // The multiplier the current unicode digit must be multiplied with.
+ // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+ uint32_t codePointMultiplier = 0;
- //match for [TERM <COLON>]
- QueryToken* term = tokens->extract();
- if(term->Type == QueryToken::TERM && tokens->peek()->Type == QueryToken::COLON){
- DelToken = MatchQueryToken(QueryToken::COLON);
+ // Used to calculate the codepoint of the escaped unicode character
+ int32_t codePoint = 0;
- CND_CONDITION(DelToken != NULL,"DelToken is NULL");
- _CLDELETE(DelToken);
-
- TCHAR* tmp = STRDUP_TtoT(term->Value);
- discardEscapeChar(tmp);
- delField = true;
- sfield = tmp;
- _CLDELETE(term);
- }else{
- tokens->push(term);
- term = NULL;
+ for (size_t i = 0; i < in...
[truncated message content] |