From: <syn...@us...> - 2008-08-27 12:53:25
Revision: 2879
http://clucene.svn.sourceforge.net/clucene/?rev=2879&view=rev
Author: synhershko
Date: 2008-08-27 12:53:20 +0000 (Wed, 27 Aug 2008)
Log Message:
-----------
Field now uses a single pointer to hold the content of the various field types. Due to this change, Internal is used less and might be removed later.
More code ported into FieldsReader
IndexInput::skipChars made protected as per JL
Introducing FieldSelector and FieldSelectorResult
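For illustration only (not part of this commit), the "one pointer plus type tag" pattern this change moves Field to can be sketched as a small standalone C++ struct. The struct and accessor names below are hypothetical stand-ins, not the CLucene API, and wchar_t is used where Field uses TCHAR; only the ValueType constants are taken from the diff further down.

    // Hypothetical stand-in for Field's new value storage: one untyped pointer
    // plus an enum recording which kind of value it currently holds.
    enum ValueType { VALUE_NONE = 0, VALUE_STRING = 1, VALUE_READER = 2,
                     VALUE_STREAM = 4, VALUE_TOKENSTREAM = 8 };

    struct TaggedFieldValue {
        void*     fieldsData;  // points to a string, reader or stream, depending on valueType
        ValueType valueType;

        TaggedFieldValue() : fieldsData(0), valueType(VALUE_NONE) {}

        // A typed accessor returns NULL unless the tag matches, mirroring the
        // stringValue()/readerValue()/streamValue() pattern in the diff below.
        const wchar_t* stringValue() const {
            return (valueType == VALUE_STRING) ? static_cast<const wchar_t*>(fieldsData) : 0;
        }
    };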
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/document/Field.cpp
branches/lucene2_3_2/src/core/CLucene/document/Field.h
branches/lucene2_3_2/src/core/CLucene/files_list.txt
branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h
branches/lucene2_3_2/src/core/CLucene/store/IndexInput.h
branches/lucene2_3_2/src/core/CMakeLists.txt
Added Paths:
-----------
branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp
branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h
Modified: branches/lucene2_3_2/src/core/CLucene/document/Field.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/Field.cpp 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/document/Field.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -15,32 +15,36 @@
struct Field::Internal{
const TCHAR* _name;
- TCHAR* _stringValue;
- CL_NS(util)::Reader* _readerValue;
- jstreams::StreamBase<char>* _streamValue;
+ //TCHAR* _stringValue;
+ //CL_NS(util)::Reader* _readerValue;
+ //jstreams::StreamBase<char>* _streamValue;
+ //void* fieldsData;
uint32_t config;
float_t boost;
};
Field::Field(const TCHAR* Name, Reader* reader, int config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(reader != NULL, "reader cannot be NULL");
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = NULL;
- internal->_readerValue = reader;
- internal->_streamValue = NULL;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = NULL;
+ //_internal->_readerValue = reader;
+ //_internal->_streamValue = NULL;
+ fieldsData = reader;
+ valueType = VALUE_READER;
+ _internal->boost=1.0f;
+
setConfig(config);
}
Field::Field(const TCHAR* Name, const TCHAR* Value, int _config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(Value != NULL, "value cannot be NULL");
@@ -53,87 +57,111 @@
_CLTHROWA(CL_ERR_IllegalArgument,"cannot store term vector information for a field that is not indexed");
*/
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = stringDuplicate( Value );
- internal->_readerValue = NULL;
- internal->_streamValue = NULL;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = stringDuplicate( Value );
+ //_internal->_readerValue = NULL;
+ //_internal->_streamValue = NULL;
+ fieldsData = stringDuplicate( Value );
+ valueType = VALUE_STRING;
+ _internal->boost=1.0f;
+
//config = INDEX_TOKENIZED; // default Field is tokenized and indexed
setConfig(_config);
}
Field::Field(const TCHAR* Name, jstreams::StreamBase<char>* Value, int config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(Value != NULL, "value cannot be NULL");
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = NULL;
- internal->_readerValue = NULL;
- internal->_streamValue = Value;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = NULL;
+ //_internal->_readerValue = NULL;
+ //_internal->_streamValue = Value;
+ fieldsData = Value;
+ valueType = VALUE_STREAM;
+ _internal->boost=1.0f;
+
setConfig(config);
}
+Field::Field(const TCHAR* Name, int config):
+ _internal(new Internal), lazy(false)
+{
+ CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
+
+ _internal->_name = CLStringIntern::intern( Name );
+ fieldsData = NULL;
+ valueType = VALUE_NONE;
+
+ _internal->boost=1.0f;
+
+ setConfig(config);
+}
+
Field::~Field(){
//Func - Destructor
//Pre - true
//Post - Instance has been destroyed
- CLStringIntern::unintern(internal->_name);
+ CLStringIntern::unintern(_internal->_name);
_resetValue();
- delete internal;
+ delete _internal;
}
/*===============FIELDS=======================*/
-const TCHAR* Field::name() const { return internal->_name; } ///<returns reference
-TCHAR* Field::stringValue() const { return internal->_stringValue; } ///<returns reference
-Reader* Field::readerValue() const { return internal->_readerValue; } ///<returns reference
-jstreams::StreamBase<char>* Field::streamValue() const { return internal->_streamValue; } ///<returns reference
-CL_NS(analysis)::TokenStream* Field::tokenStreamValue() const { return NULL; }
+const TCHAR* Field::name() const { return _internal->_name; } ///<returns reference
+TCHAR* Field::stringValue() const { return (valueType & VALUE_STRING) ? static_cast<TCHAR*>(fieldsData) : NULL; } ///<returns reference
+Reader* Field::readerValue() const { return (valueType & VALUE_READER) ? static_cast<Reader*>(fieldsData) : NULL; } ///<returns reference
+jstreams::StreamBase<char>* Field::streamValue() const { return (valueType & VALUE_STREAM) ? static_cast<jstreams::StreamBase<char>*>(fieldsData) : NULL; } ///<returns reference
+CL_NS(analysis)::TokenStream* Field::tokenStreamValue() const { return (valueType & VALUE_TOKENSTREAM) ? static_cast<CL_NS(analysis)::TokenStream*>(fieldsData) : NULL; }
-bool Field::isStored() const { return (internal->config & STORE_YES) != 0; }
-bool Field::isIndexed() const { return (internal->config & INDEX_TOKENIZED)!=0 || (internal->config & INDEX_UNTOKENIZED)!=0; }
-bool Field::isTokenized() const { return (internal->config & INDEX_TOKENIZED) != 0; }
-bool Field::isCompressed() const { return (internal->config & STORE_COMPRESS) != 0; }
-bool Field::isBinary() const { return internal->_streamValue!=NULL; }
+bool Field::isStored() const { return (_internal->config & STORE_YES) != 0; }
+bool Field::isIndexed() const { return (_internal->config & INDEX_TOKENIZED)!=0 || (_internal->config & INDEX_UNTOKENIZED)!=0; }
+bool Field::isTokenized() const { return (_internal->config & INDEX_TOKENIZED) != 0; }
+bool Field::isCompressed() const { return (_internal->config & STORE_COMPRESS) != 0; }
+bool Field::isBinary() const { return (valueType & VALUE_STREAM) && fieldsData!=NULL; }
-bool Field::isTermVectorStored() const { return (internal->config & TERMVECTOR_YES) != 0; }
-bool Field::isStoreOffsetWithTermVector() const { return (internal->config & TERMVECTOR_YES) != 0 && (internal->config & TERMVECTOR_WITH_OFFSETS) != 0; }
-bool Field::isStorePositionWithTermVector() const{ return (internal->config & TERMVECTOR_YES) != 0 && (internal->config & TERMVECTOR_WITH_POSITIONS) != 0; }
+bool Field::isTermVectorStored() const { return (_internal->config & TERMVECTOR_YES) != 0; }
+bool Field::isStoreOffsetWithTermVector() const { return (_internal->config & TERMVECTOR_YES) != 0 && (_internal->config & TERMVECTOR_WITH_OFFSETS) != 0; }
+bool Field::isStorePositionWithTermVector() const{ return (_internal->config & TERMVECTOR_YES) != 0 && (_internal->config & TERMVECTOR_WITH_POSITIONS) != 0; }
-bool Field::getOmitNorms() const { return (internal->config & INDEX_NONORMS) != 0; }
-void Field::setOmitNorms(const bool omitNorms) { internal->config |= INDEX_NONORMS; }
+bool Field::getOmitNorms() const { return (_internal->config & INDEX_NONORMS) != 0; }
+void Field::setOmitNorms(const bool omitNorms) { _internal->config |= INDEX_NONORMS; }
-bool Field::isLazy() const { return (internal->config & LAZY_YES) != 0; }
+bool Field::isLazy() const { return lazy; }
void Field::setValue(const TCHAR* value) {
_resetValue();
- internal->_stringValue = stringDuplicate( value );
+ fieldsData = stringDuplicate( value );
+ valueType = VALUE_STRING;
}
void Field::setValue(CL_NS(util)::Reader* value) {
_resetValue();
- internal->_readerValue = value;
+ fieldsData = value;
+ valueType = VALUE_READER;
}
void Field::setValue(jstreams::StreamBase<char>* value) {
_resetValue();
- internal->_streamValue = value;
+ fieldsData = value;
+ valueType = VALUE_STREAM;
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
void Field::setValue(CL_NS(analysis)::TokenStream* value) {
- _resetValue();
+ //_resetValue();
//fieldsData = value;
+ //valueType = VALUE_TOKENSTREAM;
}
-void Field::setBoost(const float_t boost) { this->internal->boost = boost; }
-float_t Field::getBoost() const { return internal->boost; }
+void Field::setBoost(const float_t boost) { this->_internal->boost = boost; }
+float_t Field::getBoost() const { return _internal->boost; }
void Field::setConfig(const uint32_t x){
uint32_t newConfig=0;
@@ -200,7 +228,7 @@
}else
newConfig |= TERMVECTOR_NO;
- internal->config = newConfig;
+ _internal->config = newConfig;
}
TCHAR* Field::toString() {
@@ -252,12 +280,12 @@
result.append(name());
result.appendChar(':');
- if (! isLazy()) {
- if (internal->_stringValue != NULL)
- result.append(internal->_stringValue);
- else if ( internal->_readerValue != NULL )
+ if (! isLazy() && fieldsData != NULL) {
+ if (valueType & VALUE_STRING)
+ result.append(static_cast<const TCHAR*>(fieldsData));
+ else if (valueType & VALUE_READER)
result.append( _T("Reader") );
- else if ( internal->_streamValue != NULL )
+ else if (valueType & VALUE_STREAM)
result.append( _T("Stream") );
else
result.append( _T("NULL") );
@@ -269,9 +297,14 @@
void Field::_resetValue() {
- _CLDELETE_CARRAY(internal->_stringValue);
- _CLDELETE(internal->_readerValue);
- _CLVDELETE( internal->_streamValue );
+ if (valueType & VALUE_STRING) {
+ _CLDELETE_CARRAY(fieldsData);
+ } else if (valueType & VALUE_READER) {
+ _CLDELETE(fieldsData);
+ } else if (valueType & VALUE_STREAM) {
+ _CLVDELETE( fieldsData );
+ }
+ valueType=VALUE_NONE;
}
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/document/Field.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/Field.h 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/document/Field.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -15,7 +15,6 @@
TODO: - Solve some inconsistencies between CL and JL - mainly in the constructors area.
- Write some more tests to make sure we conform with JL - mainly in the tokenizing and omitNorms area
- Is there a bug in JL when calling setOmitNorms after a Tokenized field was created?
- - TokenStream* implementation - mend all 3 pointers to one void* ?
*/
CL_CLASS_DEF(util,Reader)
@@ -41,8 +40,9 @@
text), so that they can be loaded lazily.
*/
class CLUCENE_EXPORT Field :LUCENE_BASE{
+private:
struct Internal;
- Internal* internal;
+ Internal* _internal;
public:
enum Store{
/** Store the original field value in the index. This is useful for short texts
@@ -128,11 +128,20 @@
TERMVECTOR_WITH_POSITIONS_OFFSETS = TERMVECTOR_WITH_OFFSETS | TERMVECTOR_WITH_POSITIONS
};
- enum { LAZY_YES = 4096 };
+ bool lazy;
+ enum ValueType {
+ VALUE_NONE = 0,
+ VALUE_STRING = 1,
+ VALUE_READER = 2,
+ VALUE_STREAM = 4,
+ VALUE_TOKENSTREAM = 8
+ };
+
Field(const TCHAR* name, const TCHAR* value, int _config);
Field(const TCHAR* name, CL_NS(util)::Reader* reader, int _config);
Field(const TCHAR* name, jstreams::StreamBase<char>* stream, int _config);
+ Field(const TCHAR* name, int _config); ///<No value, for lazy loading support
~Field();
/** The name of the field (e.g., "date", "subject", "title", "body", etc.)
@@ -159,16 +168,16 @@
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
CL_NS(analysis)::TokenStream* tokenStreamValue() const;
- // True iff the value of the field is to be stored in the index for return
+ // True if the value of the field is to be stored in the index for return
// with search hits. It is an error for this to be true if a field is
// Reader-valued.
bool isStored() const;
- // True iff the value of the field is to be indexed, so that it may be
+ // True if the value of the field is to be indexed, so that it may be
// searched on.
bool isIndexed() const;
- // True iff the value of the field should be tokenized as text prior to
+ // True if the value of the field should be tokenized as text prior to
// indexing. Un-tokenized fields are indexed as a single word and may not be
// Reader-valued.
bool isTokenized() const;
@@ -181,7 +190,7 @@
*/
bool isCompressed() const;
- /** True iff the term or terms used to index this field are stored as a term
+ /** True if the term or terms used to index this field are stored as a term
* vector, available from {@link IndexReader#getTermFreqVector(int32_t,TCHAR*)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
@@ -290,6 +299,9 @@
inline void setConfig(const uint32_t termVector);
inline void _resetValue();
+
+ void* fieldsData;
+ ValueType valueType;
};
CL_NS_END
#endif
Added: branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -0,0 +1,22 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+
+#include "_FieldSelector.h"
+
+CL_NS_DEF(document)
+
+FieldSelector::~FieldSelector(){
+}
+
+LoadFirstFieldSelector::~LoadFirstFieldSelector(){
+}
+
+FieldSelectorResult LoadFirstFieldSelector::accept(const TCHAR* fieldName) {
+ return FieldSelectorResult::LOAD_AND_BREAK;
+}
+CL_NS_END
Added: branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -0,0 +1,101 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_document_FieldSelector_
+#define _lucene_document_FieldSelector_
+
+
+CL_NS_DEF(document)
+
+/**
+ * Provides information about what should be done with this Field
+ *
+ **/
+static enum FieldSelectorResult {
+ /**
+ * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encounterd.
+ * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
+ *<p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD = 0,
+
+ /**
+ * Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until
+ * invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should
+ * return a valid instance of a {@link Fieldable}.
+ *<p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LAZY_LOAD = 1,
+
+ /**
+ * Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null.
+ * {@link Document#add(Fieldable)} is not called.
+ * <p/>
+ * {@link Document#add(Fieldable)} should not be called by the Reader.
+ */
+ NO_LOAD = 2,
+
+ /**
+ * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the
+ * Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should
+ * both be valid for this {@link Field}
+ * <p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD_AND_BREAK = 3,
+
+ /**
+ * Behaves much like {@link #LOAD} but does not uncompress any compressed data. This is used for internal purposes.
+ * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
+ * <p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD_FOR_MERGE = 4,
+
+ /** Expert: Load the size of this {@link Field} rather than its value.
+ * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value.
+ * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0]
+ */
+ SIZE = 5,
+
+ /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */
+ SIZE_AND_BREAK = 6
+};
+
+/**
+ * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about
+ * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)}
+ *
+ **/
+class FieldSelector :LUCENE_BASE {
+public:
+ ~FieldSelector();
+
+ /**
+ *
+ * @param fieldName the field to accept or reject
+ * @return an instance of {@link FieldSelectorResult}
+ * if the {@link Field} named <code>fieldName</code> should be loaded.
+ */
+ virtual FieldSelectorResult accept(const TCHAR* fieldName) = 0;
+};
+
+/**
+ * Load the First field and break.
+ * <p/>
+ * See {@link FieldSelectorResult#LOAD_AND_BREAK}
+ */
+class LoadFirstFieldSelector :FieldSelector {
+public:
+ ~LoadFirstFieldSelector();
+
+ FieldSelectorResult accept(const TCHAR* fieldName);
+};
+
+CL_NS_END
+#endif
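As a usage illustration only (not part of this commit), a caller could subclass the FieldSelector declared above to load one named field eagerly and skip everything else. The class name and the "title" field are assumptions; accept(), LOAD and NO_LOAD come from the header, and _T()/_tcscmp follow the TCHAR conventions already used in this code.

    // Hypothetical selector: eagerly load the "title" field, skip all others.
    class TitleOnlySelector : public FieldSelector {
    public:
        ~TitleOnlySelector() {}
        FieldSelectorResult accept(const TCHAR* fieldName) {
            return (_tcscmp(fieldName, _T("title")) == 0) ? LOAD : NO_LOAD;
        }
    };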
Modified: branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -13,7 +13,7 @@
#include "CLucene/store/Directory.h"
#include "CLucene/store/IndexInput.h"
#include "CLucene/document/Document.h"
-#include "CLucene/document/Field.h"
+#include "CLucene/document/_FieldSelector.h"
#include "_FieldInfos.h"
#include "_FieldsWriter.h"
#include "_FieldsReader.h"
@@ -60,8 +60,7 @@
indexStream = d->openInput( buf, _readBufferSize );
_CLDELETE_CaARRAY( buf );
- /*
- if (docStoreOffset != -1) {
+ if (_docStoreOffset != -1) {
// We read only a slice out of this shared fields file
this->docStoreOffset = _docStoreOffset;
this->_size = size;
@@ -73,9 +72,9 @@
} else {
this->docStoreOffset = 0;
this->_size = (int32_t) (indexStream->length() >> 3);
- }*/
+ }
- _size = (int32_t)indexStream->length()/8; //todo: remove when uncommenting block above
+ //_size = (int32_t)indexStream->length()/8;
numTotalDocs = (int32_t) (indexStream->length() >> 3);
success = true;
@@ -133,10 +132,10 @@
return _size;
}
-bool FieldsReader::doc(int32_t n, Document* doc) {
- if ( n * 8L > indexStream->length() )
+bool FieldsReader::doc(int32_t n, Document* doc, CL_NS(document)::FieldSelector* fieldSelector) {
+ if ( (n + docStoreOffset) * 8L > indexStream->length() )
return false;
- indexStream->seek(n * 8L);
+ indexStream->seek((n + docStoreOffset) * 8L);
int64_t position = indexStream->readLong();
fieldsStream->seek(position);
@@ -144,11 +143,14 @@
for (int32_t i = 0; i < numFields; i++) {
int32_t fieldNumber = fieldsStream->readVInt();
FieldInfo* fi = fieldInfos->fieldInfo(fieldNumber);
+ if ( fi == NULL ) _CLTHROWA(CL_ERR_IO, "Field stream is invalid");
- if ( fi == NULL )
- _CLTHROWA(CL_ERR_IO, "Field stream is invalid");
+ FieldSelectorResult acceptField = (fieldSelector == NULL) ? CL_NS(document)::LOAD : fieldSelector->accept(fi->name);
uint8_t bits = fieldsStream->readByte();
+ CND_CONDITION(bits <= FieldsWriter::FIELD_IS_COMPRESSED + FieldsWriter::FIELD_IS_TOKENIZED + FieldsWriter::FIELD_IS_BINARY,
+ "invalid field bits");
+
if ((bits & FieldsWriter::FIELD_IS_BINARY) != 0) {
int32_t fieldLen = fieldsStream->readVInt();
FieldsReader::FieldsStreamHolder* subStream = new FieldsReader::FieldsStreamHolder(fieldsStream, fieldLen);
@@ -215,7 +217,7 @@
fieldsStream->seek(fieldsStream->getFilePointer() + fieldLen);
}else {
TCHAR* fvalue = fieldsStream->readString();
- Field* f = _CLNEW Field(
+ Field* f = _CLNEW Field(
fi->name, // name
fvalue, // read value
bits);
@@ -229,6 +231,124 @@
return true;
}
+CL_NS(store)::IndexInput* FieldsReader::rawDocs(int32_t* lengths, const int32_t startDocID, const int32_t numDocs) {
+ indexStream->seek((docStoreOffset+startDocID) * 8L);
+ int64_t startOffset = indexStream->readLong();
+ int64_t lastOffset = startOffset;
+ int32_t count = 0;
+ while (count < numDocs) {
+ int64_t offset;
+ const int32_t docID = docStoreOffset + startDocID + count + 1;
+ CND_CONDITION( docID <= numTotalDocs, "invalid docID");
+ if (docID < numTotalDocs)
+ offset = indexStream->readLong();
+ else
+ offset = fieldsStream->length();
+ lengths[count++] = static_cast<int32_t>(offset-lastOffset);
+ lastOffset = offset;
+ }
+
+ fieldsStream->seek(startOffset);
+
+ return fieldsStream;
+}
+
+void FieldsReader::skipField(const bool binary, const bool compressed) {
+ skipField(binary, compressed, fieldsStream->readVInt());
+}
+
+void FieldsReader::skipField(const bool binary, const bool compressed, const int32_t toRead) {
+ if (binary || compressed) {
+ int64_t pointer = fieldsStream->getFilePointer();
+ fieldsStream->seek(pointer + toRead);
+ } else {
+ //We need to skip chars. This will slow us down, but still better
+ fieldsStream->skipChars(toRead);
+ }
+}
+
+void FieldsReader::addFieldLazy(CL_NS(document)::Document* doc, FieldInfo* fi, const bool binary,
+ const bool compressed, const bool tokenize) {
+ if (binary) {
+ int32_t toRead = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ if (compressed) {
+ //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
+ doc->add(*_CLNEW LazyField(this, fi->name, Field::STORE_COMPRESS, toRead, pointer));
+ } else {
+ //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
+ doc->add(*_CLNEW LazyField(this, fi->name, Field::STORE_YES, toRead, pointer));
+ }
+ //Need to move the pointer ahead by toRead positions
+ fieldsStream->seek(pointer + toRead);
+ } else {
+ //Field.Store store = Field.Store.YES;
+ //Field.Index index = getIndexType(fi, tokenize);
+ //Field.TermVector termVector = getTermVectorType(fi);
+
+ LazyField* f = NULL;
+ if (compressed) {
+ int32_t toRead = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ f = _CLNEW LazyField(this, fi->name, Field::STORE_COMPRESS, toRead, pointer);
+ //skip over the part that we aren't loading
+ fieldsStream->seek(pointer + toRead);
+ f->setOmitNorms(fi->omitNorms);
+ } else {
+ int32_t length = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ //Skip ahead of where we are by the length of what is stored
+ fieldsStream->skipChars(length);
+ f = _CLNEW LazyField(this, fi->name, Field::STORE_YES | getIndexType(fi, tokenize) | getTermVectorType(fi), length, pointer);
+ f->setOmitNorms(fi->omitNorms);
+ }
+ doc->add(*f);
+ }
+}
+
+/*
+// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
+// Read just the size -- caller must skip the field content to continue reading fields
+// Return the size in bytes or chars, depending on field type
+int32_t FieldsReader::addFieldSize(const CL_NS(document)::Document* doc, const FieldInfo* fi, const bool binary, const bool compressed) {
+ const int32_t size = fieldsStream->readVInt();
+ const int32_t bytesize = binary || compressed ? size : 2*size;
+ uint8_t* sizebytes = _CL_NEWARRAY(byte, 4);
+ sizebytes[0] = (byte) (bytesize>>>24);
+ sizebytes[1] = (byte) (bytesize>>>16);
+ sizebytes[2] = (byte) (bytesize>>> 8);
+ sizebytes[3] = (byte) bytesize ;
+ doc->add(*_CLNEW Field(fi->name, sizebytes, Field::STORE_YES));
+ return size;
+}*/
+
+CL_NS(document)::Field::TermVector FieldsReader::getTermVectorType(const FieldInfo* fi) {
+ if (fi->storeTermVector) {
+ if (fi->storeOffsetWithTermVector) {
+ if (fi->storePositionWithTermVector) {
+ return Field::TERMVECTOR_WITH_POSITIONS_OFFSETS;
+ } else {
+ return Field::TERMVECTOR_WITH_OFFSETS;
+ }
+ } else if (fi->storePositionWithTermVector) {
+ return Field::TERMVECTOR_WITH_POSITIONS;
+ } else {
+ return Field::TERMVECTOR_YES;
+ }
+ } else {
+ return Field::TERMVECTOR_NO ;
+ }
+}
+
+CL_NS(document)::Field::Index FieldsReader::getIndexType(const FieldInfo* fi, const bool tokenize) {
+ if (fi->isIndexed && tokenize)
+ return Field::INDEX_TOKENIZED;
+ else if (fi->isIndexed && !tokenize)
+ return Field::INDEX_UNTOKENIZED;
+ else
+ return Field::INDEX_NO;
+}
+
FieldsReader::FieldsStreamHolder::FieldsStreamHolder(IndexInput* indexInput, int32_t subLength){
this->indexInput = indexInput->clone();
this->indexInputStream = new IndexInputStream(this->indexInput);
@@ -268,4 +388,98 @@
return ret;
}
+
+FieldsReader::LazyField::LazyField(FieldsReader* _parent, const TCHAR* _name,
+ int config, const int32_t _toRead, const int64_t _pointer)
+: Field(_name, config), parent(_parent) {
+ // todo: need to allow for auto setting Field::INDEX_NO | Field::TERMVECTOR_NO so only Store is required
+ this->toRead = _toRead;
+ this->pointer = _pointer;
+ lazy = true;
+}
+
+CL_NS(store)::IndexInput* FieldsReader::LazyField::getFieldStream() {
+ CL_NS(store)::IndexInput* localFieldsStream = parent->fieldsStreamTL.get();
+ if (localFieldsStream == NULL) {
+ localFieldsStream = parent->cloneableFieldsStream->clone();
+ parent->fieldsStreamTL.set(localFieldsStream);
+ }
+ return localFieldsStream;
+}
+
+uint8_t* FieldsReader::LazyField::binaryValue() {
+ parent->ensureOpen();
+ if (fieldsData == NULL) {
+ uint8_t* b = _CL_NEWARRAY(uint8_t, toRead);
+ CL_NS(store)::IndexInput* localFieldsStream = getFieldStream();
+
+ localFieldsStream->seek(pointer);
+ localFieldsStream->readBytes(b, toRead);
+ if (isCompressed()) {
+ //fieldsData = uncompress(b);
+ } else {
+ fieldsData = b;
+ }
+ valueType = VALUE_STREAM;
+ }
+ return static_cast<uint8_t*>(fieldsData); // instanceof byte[] ? (byte[]) fieldsData : null;
+}
+
+CL_NS(util)::Reader* FieldsReader::LazyField::readerValue() {
+ parent->ensureOpen();
+ return (valueType & VALUE_READER) ? static_cast<CL_NS(util)::Reader*>(fieldsData) : NULL;
+}
+
+
+CL_NS(analysis)::TokenStream* FieldsReader::LazyField::tokenStreamValue() {
+ parent->ensureOpen();
+ return (valueType & VALUE_TOKENSTREAM) ? static_cast<CL_NS(analysis)::TokenStream*>(fieldsData) : NULL;
+}
+
+
+/** The value of the field as a String, or null. If null, the Reader value,
+* binary value, or TokenStream value is used. Exactly one of stringValue(),
+* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+const TCHAR* FieldsReader::LazyField::stringValue() {
+ parent->ensureOpen();
+ if (fieldsData == NULL) {
+ CL_NS(store)::IndexInput* localFieldsStream = getFieldStream();
+ localFieldsStream->seek(pointer);
+ if (isCompressed()) {
+ uint8_t* b = _CL_NEWARRAY(uint8_t, toRead);
+ localFieldsStream->readBytes(b, toRead);
+ _resetValue();
+ //fieldsData = new String(uncompress(b), "UTF-8");
+ } else {
+ //read in chars b/c we already know the length we need to read
+ TCHAR* chars = _CL_NEWARRAY(TCHAR, toRead);
+ localFieldsStream->readChars(chars, 0, toRead);
+ _resetValue();
+ fieldsData = chars;
+ }
+ valueType = VALUE_STRING;
+ }
+ return static_cast<const TCHAR*>(fieldsData); //instanceof String ? (String) fieldsData : null;
+}
+
+int64_t FieldsReader::LazyField::getPointer() const {
+ parent->ensureOpen();
+ return pointer;
+}
+
+void FieldsReader::LazyField::setPointer(const int64_t _pointer) {
+ parent->ensureOpen();
+ this->pointer = _pointer;
+}
+
+int32_t FieldsReader::LazyField::getToRead() const {
+ parent->ensureOpen();
+ return toRead;
+}
+
+void FieldsReader::LazyField::setToRead(const int32_t _toRead) {
+ parent->ensureOpen();
+ this->toRead = _toRead;
+}
+
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -9,20 +9,22 @@
...
[truncated message content]
From: <ust...@us...> - 2009-07-08 09:54:50
Revision: 3013
http://clucene.svn.sourceforge.net/clucene/?rev=3013&view=rev
Author: ustramooner
Date: 2009-07-08 09:54:46 +0000 (Wed, 08 Jul 2009)
Log Message:
-----------
Updated FuzzyQuery to conform with JL; it should work better now. The old similarity code, which looks optimized but wasn't working properly, is commented out as a reference for future optimizations.
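For reference only (not part of this commit), the similarity the reworked FuzzyTermEnum computes boils down to a plain Levenshtein distance scaled into [0,1] via 1 - dist / (prefixLength + min(n, m)). The standalone function below sketches that formula; text and target are assumed to already have any common prefix stripped, and the early-exit and array-reuse optimizations of the real code are omitted.

    #include <algorithm>
    #include <string>
    #include <vector>

    // Hypothetical, simplified version of the fuzzy similarity measure.
    static float fuzzySimilarity(const std::wstring& text, const std::wstring& target,
                                 size_t prefixLength) {
        const size_t n = text.size(), m = target.size();
        std::vector<std::vector<int> > d(n + 1, std::vector<int>(m + 1, 0));
        for (size_t i = 0; i <= n; ++i) d[i][0] = (int)i;
        for (size_t j = 0; j <= m; ++j) d[0][j] = (int)j;
        for (size_t i = 1; i <= n; ++i) {
            for (size_t j = 1; j <= m; ++j) {
                const int cost = (text[i - 1] == target[j - 1]) ? 0 : 1;
                d[i][j] = std::min(std::min(d[i - 1][j] + 1, d[i][j - 1] + 1),
                                   d[i - 1][j - 1] + cost);
            }
        }
        // Scale the edit distance by the shorter length plus the shared prefix.
        return 1.0f - (float)d[n][m] / (float)(prefixLength + std::min(n, m));
    }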
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/CLConfig.h
branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h
branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h
branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h
branches/lucene2_3_2/src/core/files_list.txt
Modified: branches/lucene2_3_2/src/core/CLucene/CLConfig.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/CLConfig.h 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/CLConfig.h 2009-07-08 09:54:46 UTC (rev 3013)
@@ -202,5 +202,19 @@
//
////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////
+// FuzzyQuery settings
+////////////////////////////////////////////////////////////////////
+//
+// This should be somewhere around the average long word.
+// If it is longer, we waste time and space. If it is shorter, we waste a
+// little bit of time growing the array as we encounter longer words.
+//
+#define LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX 19
+//
+////////////////////////////////////////////////////////////////////
+
+
#endif
Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp 2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,14 +12,8 @@
CL_NS_DEF(search)
- FilteredTermEnum::FilteredTermEnum(){
- //Func - Constructor
- //Pre - true
- //Post - Instance has been created
-
- currentTerm = NULL;
- actualEnum = NULL;
- }
+FilteredTermEnum::FilteredTermEnum():currentTerm(NULL),actualEnum(NULL){
+}
FilteredTermEnum::~FilteredTermEnum() {
//Func - Destructor
@@ -48,7 +42,7 @@
//The actual enumerator is not initialized!
if (actualEnum == NULL){
return false;
- }
+ }
//Finalize the currentTerm and reset it to NULL
_CLDECDELETE( currentTerm );
@@ -101,12 +95,11 @@
//Check if actualEnum is valid
if (actualEnum){
//Close the enumeration
- actualEnum->close();
- }
+ actualEnum->close();
+ //Destroy the enumeration
+ _CLDELETE(actualEnum);
+ }
- //Destroy the enumeration
- _CLDELETE(actualEnum);
-
//Destroy currentTerm
_CLDECDELETE(currentTerm);
}
@@ -118,8 +111,7 @@
CND_PRECONDITION(actualEnum != NULL,"actualEnum is NULL");
- _CLDELETE(this->actualEnum);
-
+ _CLLDELETE(this->actualEnum);
this->actualEnum = actualEnum;
// Find the first term that matches
Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h 2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,47 +12,48 @@
#include "CLucene/index/Terms.h"
CL_NS_DEF(search)
- //FilteredTermEnum is an abstract class for enumerating a subset of all terms.
- //
- //Term enumerations are always ordered by term->compareTo(). Each term in
- //the enumeration is greater than all that precede it.
-
- class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
- public:
- //Constructor
- FilteredTermEnum();
- //Destructor
- virtual ~FilteredTermEnum();
-
- //Equality measure on the term
- virtual float_t difference() = 0;
+/** Abstract class for enumerating a subset of all terms.
- //Returns the docFreq of the current Term in the enumeration.
- int32_t docFreq() const ;
-
- //Increments the enumeration to the next element
- bool next() ;
-
- //Returns a pointer to the current Term in the enumeration.
- CL_NS(index)::Term* term();
- CL_NS(index)::Term* term(bool pointer);
-
- //Closes the enumeration to further activity, freeing resources.
- void close();
+<p>Term enumerations are always ordered by Term.compareTo(). Each term in
+the enumeration is greater than all that precede it. */
+class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
+public:
+ FilteredTermEnum();
+ virtual ~FilteredTermEnum();
- protected:
- //Equality compare on the term */
- virtual bool termCompare(CL_NS(index)::Term* term) = 0;
-
- //Indiciates the end of the enumeration has been reached
- virtual bool endEnum() = 0;
-
- void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
-
- private:
- CL_NS(index)::Term* currentTerm;
- CL_NS(index)::TermEnum* actualEnum;
-
- };
+ /** Equality measure on the term */
+ virtual float_t difference() = 0;
+
+ /**
+ * Returns the docFreq of the current Term in the enumeration.
+ * Returns -1 if no Term matches or all terms have been enumerated.
+ */
+ int32_t docFreq() const;
+
+ /** Increments the enumeration to the next element. True if one exists. */
+ bool next() ;
+
+ /** Returns the current Term in the enumeration.
+ * Returns null if no Term matches or all terms have been enumerated. */
+ CL_NS(index)::Term* term(bool pointer);
+ CL_NS(index)::Term* term();
+
+ /** Closes the enumeration to further activity, freeing resources. */
+ void close();
+
+protected:
+ /** Equality compare on the term */
+ virtual bool termCompare(CL_NS(index)::Term* term) = 0;
+
+ /** Indicates the end of the enumeration has been reached */
+ virtual bool endEnum() = 0;
+
+ void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
+
+private:
+ CL_NS(index)::Term* currentTerm;
+ CL_NS(index)::TermEnum* actualEnum;
+
+};
CL_NS_END
#endif
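For illustration only (not part of this commit), a minimal subclass following the contract documented above might look like the sketch below. The class itself is hypothetical; setEnum(), termCompare(), difference() and endEnum() are the members declared in this header, and reader->terms(), _CL_POINTER and _CLDECDELETE follow the usage visible elsewhere in these diffs.

    // Hypothetical enum that accepts only terms sharing the query term's prefix.
    // Terms are ordered, so the first mismatch after the prefix range ends the scan.
    class PrefixOnlyTermEnum : public FilteredTermEnum {
        CL_NS(index)::Term* prefixTerm;
        bool done;
    public:
        PrefixOnlyTermEnum(CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* prefix)
            : prefixTerm(_CL_POINTER(prefix)), done(false) {
            setEnum(reader->terms(prefix));   // position the underlying TermEnum at the prefix
        }
        virtual ~PrefixOnlyTermEnum() { _CLDECDELETE(prefixTerm); }
        float_t difference() { return 1.0f; } // every accepted term is an equally good match
    protected:
        bool endEnum() { return done; }
        bool termCompare(CL_NS(index)::Term* term) {
            if (term != NULL && term->field() == prefixTerm->field()
                && _tcsncmp(term->text(), prefixTerm->text(), prefixTerm->textLength()) == 0)
                return true;
            done = true;                      // past the prefix range; stop enumerating
            return false;
        }
    };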
Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-07-08 09:54:46 UTC (rev 3013)
@@ -8,366 +8,459 @@
#include "CLucene/index/Term.h"
#include "CLucene/index/IndexReader.h"
#include "Similarity.h"
-#include "CLucene/util/StringBuffer.h"
#include "FuzzyQuery.h"
+#include "BooleanQuery.h"
+#include "BooleanClause.h"
+#include "TermQuery.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/PriorityQueue.h"
+
CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)
- /** Finds and returns the smallest of three integers
- precondition: Must define int32_t __t for temporary storage and result
- */
- #define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
+/** Finds and returns the smallest of three integers
+ * precondition: Must define int32_t __t for temporary storage and result
+ */
+#define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
- /**
- * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
- * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
- * <code>minSimilarity</code>.
- *
- * @param reader Delivers terms.
- * @param term Pattern term.
- * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
- * @param prefixLength Length of required common prefix. Default value is 0.
- * @throws IOException
- */
- FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t prefixLength):
- distance(0),
- _endEnum(false),
- prefix(STRDUP_TtoT(LUCENE_BLANK_STRING)),
- prefixLength(0),
+ FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t _prefixLength):
+ FilteredTermEnum(),d(NULL),dWidth(0),dHeight(0),_similarity(0),_endEnum(false),searchTerm(_CL_POINTER(term)),
+ text(NULL),textLen(0),prefix(NULL)/* ISH: was STRDUP_TtoT(LUCENE_BLANK_STRING)*/,prefixLength(_prefixLength),
minimumSimilarity(minSimilarity)
{
- //Func - Constructor
- //Pre - reader contains a valid reference to an IndexReader
- // term != NULL
- //Post - The instance has been created
+ CND_PRECONDITION(term != NULL,"term is NULL");
- CND_PRECONDITION(term != NULL,"term is NULL");
-
- scale_factor = 1.0f / (1.0f - minimumSimilarity);
- searchTerm = _CL_POINTER(term);
-
- text = STRDUP_TtoT(term->text());
- textLen = term->textLength();
-
-
+ if (minSimilarity >= 1.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be greater than or equal to 1");
+ else if (minSimilarity < 0.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be less than 0");
+ if(_prefixLength < 0)
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength cannot be less than 0");
+
+ scale_factor = 1.0f / (1.0f - minimumSimilarity); // only now we are safe from a division by zero
+ //TODO: this.field = searchTerm.field();
+
+ //The prefix could be longer than the word.
+ //It's kind of silly though. It means we must match the entire word.
+ const size_t fullSearchTermLength = searchTerm->textLength();
+ const size_t realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;
+
+ text = STRDUP_TtoT(searchTerm->text() + realPrefixLength);
+ textLen = fullSearchTermLength - realPrefixLength;
+
+ // TODO: what is safer to use, prefixLength or realPrefixLength?
+ prefix = _CL_NEWARRAY(TCHAR,realPrefixLength+1);
+ _tcsncpy(prefix, searchTerm->text(), realPrefixLength);
+ prefix[realPrefixLength]='\0';
+
+ initializeMaxDistances();
+ dWidth = LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; // default length of the d array
+ dHeight = textLen + 1;
+
+ Term* trm = _CLNEW Term(searchTerm->field(), prefix, true); // _CLNEW Term(term, prefix); -- not intern'd?
+ setEnum(reader->terms(trm));
+ _CLDECDELETE(trm);
+
+
+ /* LEGACY:
//Initialize e to NULL
e = NULL;
eWidth = 0;
eHeight = 0;
-
+
if(prefixLength > 0 && prefixLength < textLen){
- this->prefixLength = prefixLength;
-
- prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
- _tcsncpy(prefix,text,prefixLength);
- prefix[prefixLength]='\0';
-
- textLen = prefixLength;
- text[textLen]='\0';
- }
-
-
- //Set the enumeration
- Term* trm = _CLNEW Term(term, prefix);
- setEnum(reader->terms(trm));
- _CLDECDELETE(trm);
- }
+ this->prefixLength = prefixLength;
- FuzzyTermEnum::~FuzzyTermEnum(){
- //Func - Destructor
- //Pre - true
- //Post - FuzzyTermEnum has been destroyed
+ prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
+ _tcsncpy(prefix,text,prefixLength);
+ prefix[prefixLength]='\0';
- //Close the enumeration
- close();
- }
+ textLen = prefixLength;
+ text[textLen]='\0';
+ }
+ */
+ }
+
+ FuzzyTermEnum::~FuzzyTermEnum(){
+ close();
+ }
const char* FuzzyTermEnum::getObjectName() const{ return getClassName(); }
const char* FuzzyTermEnum::getClassName(){ return "FuzzyTermEnum"; }
- bool FuzzyTermEnum::endEnum() {
- //Func - Returns the fact if the current term in the enumeration has reached the end
- //Pre - true
- //Post - The boolean value of endEnum has been returned
+ bool FuzzyTermEnum::endEnum() {
+ return _endEnum;
+ }
- return _endEnum;
- }
+ void FuzzyTermEnum::close(){
- void FuzzyTermEnum::close(){
- //Func - Close the enumeration
- //Pre - true
- //Post - The enumeration has been closed
+ FilteredTermEnum::close();
- FilteredTermEnum::close();
-
- //Finalize the searchTerm
- _CLDECDELETE(searchTerm);
- //Destroy e
- _CLDELETE_ARRAY(e);
+ //Finalize the searchTerm
+ _CLDECDELETE(searchTerm);
- _CLDELETE_CARRAY(text);
+ free(d);
+ d=NULL;
- _CLDELETE_CARRAY(prefix);
- }
+ _CLDELETE_CARRAY(text);
- bool FuzzyTermEnum::termCompare(Term* term) {
- //Func - Compares term with the searchTerm using the Levenshtein distance.
- //Pre - term is NULL or term points to a Term
- //Post - if pre(term) is NULL then false is returned otherwise
- // if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD
- // then true is returned
-
- if (term == NULL){
- return false; //Note that endEnum is not set to true!
- }
+ _CLDELETE_CARRAY(prefix);
+ }
- const TCHAR* termText = term->text();
- size_t termTextLen = term->textLength();
+ bool FuzzyTermEnum::termCompare(Term* term) {
+ //Func - Compares term with the searchTerm using the Levenshtein distance.
+ //Pre - term is NULL or term points to a Term
+ //Post - if pre(term) is NULL then false is returned otherwise
+ // if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD
+ // then true is returned
- //Check if the field name of searchTerm of term match
- //(we can use == because fields are interned)
- if ( searchTerm->field() == term->field() &&
- (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+ if (term == NULL){
+ return false; //Note that endEnum is not set to true!
+ }
- const TCHAR* target = termText+prefixLength;
- size_t targetLen = termTextLen-prefixLength;
+ const TCHAR* termText = term->text();
+ const size_t termTextLen = term->textLength();
- //Calculate the Levenshtein distance
- int32_t dist = editDistance(text, target, textLen, targetLen);
- distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
- return (distance > minimumSimilarity);
- }
+ //Check if the field name of searchTerm of term match
+ //(we can use == because fields are interned)
+ if ( searchTerm->field() == term->field() &&
+ (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+
+ const TCHAR* target = termText+prefixLength;
+ const size_t targetLen = termTextLen-prefixLength;
+ _similarity = similarity(target, targetLen);
+ return (_similarity > minimumSimilarity);
+
+ /* LEGACY:
+ //Calculate the Levenshtein distance
+ int32_t dist = editDistance(text, target, textLen, targetLen);
+ distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
+ return (distance > minimumSimilarity);
+ */
+ }
_endEnum = true;
return false;
- }
+ }
- float_t FuzzyTermEnum::difference() {
- //Func - Returns the difference between the distance and the fuzzy threshold
- // multiplied by the scale factor
- //Pre - true
- //Post - The difference is returned
+ float_t FuzzyTermEnum::difference() {
+ return (float_t)((_similarity - minimumSimilarity) * scale_factor );
+ }
- return (float_t)((distance - minimumSimilarity) * scale_factor );
- }
+ // TODO: had synchronized in definition
+ float_t FuzzyTermEnum::similarity(const TCHAR* target, const size_t m) {
+ const size_t n = textLen; // TODO: remove after replacing n with textLen
+ if (n == 0) {
+ //we don't have anything to compare. That means if we just add
+ //the letters for m we get the new word
+ return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) m / prefixLength);
+ }
+ if (m == 0) {
+ return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) n / prefixLength);
+ }
+
+ const int32_t maxDistance = getMaxDistance(m);
+
+ if (maxDistance < abs((int32_t)(m-n))) {
+ //just adding the characters of m to n or vice-versa results in
+ //too many edits
+ //for example "pre" length is 3 and "prefixes" length is 8. We can see that
+ //given this optimal circumstance, the edit distance cannot be less than 5.
+ //which is 8-3 or more precisesly Math.abs(3-8).
+ //if our maximum edit distance is 4, then we can discard this word
+ //without looking at it.
+ return 0.0f;
+ }
+
+ //let's make sure we have enough room in our array to do the distance calculations.
+ //Check if the array must be reallocated because it is too small or does not exist
+
+ // TODO: realloc should be able to allocate memory for NULL pointers; if thats the case the NULL
+ // check here is redundant
+ if (d == NULL){
+ dWidth = cl_max(dWidth, n+1);
+ dHeight = cl_max(dHeight, m+1);
+ d = reinterpret_cast<int32_t*>(malloc(sizeof(int32_t)*dWidth*dHeight));
+ } else if (dWidth <= n || dHeight <= m) {
+ //growDistanceArray
+ dWidth = cl_max(dWidth, n+1);
+ dHeight = cl_max(dHeight, m+1);
+ d = reinterpret_cast<int32_t*>(realloc(d, sizeof(int32_t)*dWidth*dHeight));
+ }
+
+ size_t i; // iterates through the source string
+ size_t j; // iterates through the target string
+
+ // init matrix d
+ for (i = 0; i <= n; i++){
+ d[i + (0*dWidth)] = i;
+ }
+ for (j = 0; j <= m; j++){
+ d[0 + (j*dWidth)] = j;
+ }
+
+ int32_t __t; //temporary variable for min3
+
+ // start computing edit distance
+ TCHAR s_i; // ith character of s
+ for (i = 1; i <= n; i++) {
+ int32_t bestPossibleEditDistance = m;
+ s_i = text[i - 1];
+ for (j = 1; j <= m; j++) {
+ if (s_i != target[j-1]) {
+ min3(d[i-1 + (j*dWidth)], d[i + ((j-1)*dWidth)], d[i-1 + ((j-1)*dWidth)]);
+ d[i + (j*dWidth)] = __t+1;
+ }
+ else {
+ min3(d[i-1 + (j*dWidth)]+1, d[i + ((j-1)*dWidth)]+1, d[i-1 + ((j-1)*dWidth)]);
+ d[i + (j*dWidth)] = __t;
+ }
+ bestPossibleEditDistance = cl_min(bestPossibleEditDistance, d[i + (j*dWidth)]);
+ }
+
+ //After calculating row i, the best possible edit distance
+ //can be found by finding the smallest value in a given column.
+ //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+ if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
+ //the closest the target can be to the text is just too far away.
+ //this target is leaving the party early.
+ return 0.0f;
+ }
+ }
+
+ // this will return less than 0.0 when the edit distance is
+ // greater than the number of characters in the shorter word.
+ // but this was the formula that was previously used in FuzzyTermEnum,
+ // so it has not been changed (even though minimumSimilarity must be
+ // greater than 0.0)
+ return 1.0f - ((float_t)d[n + m*dWidth] / (float_t) (prefixLength + cl_min(n, m)));
+ }
+
+ int32_t FuzzyTermEnum::getMaxDistance(const size_t m) {
+ return (m < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX) ? maxDistances[m] : calculateMaxDistance(m);
+ }
+
+ void FuzzyTermEnum::initializeMaxDistances() {
+ for (int32_t i = 0; i < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; i++) {
+ maxDistances[i] = calculateMaxDistance(i);
+ }
+ }
+
+ int32_t FuzzyTermEnum::calculateMaxDistance(const size_t m) const {
+ return (int32_t) ((1-minimumSimilarity) * (cl_min(textLen, m) + prefixLength));
+ }
+ /* LEGACY:
+ int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
+ //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity
+ // between two strings where the distance is measured as the number of character
+ // deletions, insertions or substitutions required to transform one string to
+ // the other string.
+ //Pre - s != NULL and contains the source string
+ // t != NULL and contains the target string
+ // n >= 0 and contains the length of the source string
+ // m >= 0 and containts the length of the target string
+ //Post - The distance has been returned
- int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
- //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity
- // between two strings where the distance is measured as the number of character
- // deletions, insertions or substitutions required to transform one string to
- // the other string.
- //Pre - s != NULL and contains the source string
- // t != NULL and contains the target string
- // n >= 0 and contains the length of the source string
- // m >= 0 and containts the length of th target string
- //Post - The distance has been returned
+ CND_PRECONDITION(s != NULL, "s is NULL");
+ CND_PRECONDITION(t != NULL, "t is NULL");
+ CND_PRECONDITION(n >= 0," n is a negative number");
+ CND_PRECONDITION(n >= 0," n is a negative number");
- CND_PRECONDITION(s != NULL, "s is NULL");
- CND_PRECONDITION(t != NULL, "t is NULL");
- CND_PRECONDITION(n >= 0," n is a negative number");
- CND_PRECONDITION(n >= 0," n is a negative number");
+ int32_t i; // iterates through s
+ int32_t j; // iterates through t
+ TCHAR s_i; // ith character of s
- int32_t i; // iterates through s
- int32_t j; // iterates through t
- TCHAR s_i; // ith character of s
+ if (n == 0)
+ return m;
+ if (m == 0)
+ return n;
- if (n == 0)
- return m;
- if (m == 0)
- return n;
+ //Check if the array must be reallocated because it is too small or does not exist
+ if (e == NULL || eWidth <= n || eHeight <= m) {
+ //Delete e if possible
+ _CLDELETE_ARRAY(e);
+ //resize e
+ eWidth = cl_max(eWidth, n+1);
+ eHeight = cl_max(eHeight, m+1);
+ e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
+ }
- //Check if the array must be reallocated because it is too small or does not exist
- if (e == NULL || eWidth <= n || eHeight <= m) {
- //Delete e if possible
- _CLDELETE_ARRAY(e);
- //resize e
- eWidth = cl_max(eWidth, n+1);
- eHeight = cl_max(eHeight, m+1);
- e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
- }
-
- CND_CONDITION(e != NULL,"e is NULL");
+ CND_CONDITION(e != NULL,"e is NULL");
- // init matrix e
- for (i = 0; i <= n; i++){
- e[i + (0*eWidth)] = i;
- }
- for (j = 0; j <= m; j++){
- e[0 + (j*eWidth)] = j;
- }
+ // init matrix e
+ for (i = 0; i <= n; i++){
+ e[i + (0*eWidth)] = i;
+ }
+ for (j = 0; j <= m; j++){
+ e[0 + (j*eWidth)] = j;
+ }
- int32_t __t; //temporary variable for min3
+ int32_t __t; //temporary variable for min3
- // start computing edit distance
- for (i = 1; i <= n; i++) {
- s_i = s[i - 1];
- for (j = 1; j <= m; j++) {
- if (s_i != t[j-1]){
- min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
- e[i + (j*eWidth)] = __t+1;
- }else{
- min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
- e[i + (j*eWidth)] = __t;
+ // start computing edit distance
+ for (i = 1; i <= n; i++) {
+ s_i = s[i - 1];
+ for (j = 1; j <= m; j++) {
+ if (s_i != t[j-1]){
+ min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
+ e[i + (j*eWidth)] = __t+1;
+ }else{
+ min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
+ e[i + (j*eWidth)] = __t;
+ }
}
- }
- }
+ }
- // we got the result!
- return e[n + ((m)*eWidth)];
- }
+ // we got the result!
+ return e[n + ((m)*eWidth)];
+ }*/
+ class FuzzyQuery::ScoreTerm {
+ public:
+ Term* term;
+ float_t score;
- /**
- * Create a new FuzzyQuery that will match terms with a similarity
- * of at least <code>minimumSimilarity</code> to <code>term</code>.
- * If a <code>prefixLength</code> > 0 is specified, a common prefix
- * of that length is also required.
- *
- * @param term the term to search for
- * @param minimumSimilarity a value between 0 and 1 to set the required similarity
- * between the query term and the matching terms. For example, for a
- * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
- * as the query term is considered similar to the query term if the edit distance
- * between both terms is less than <code>length(term)*0.5</code>
- * @param prefixLength length of common (non-fuzzy) prefix
- * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0
- * or if prefixLength < 0 or > <code>term.text().length()</code>.
- */
- FuzzyQuery::FuzzyQuery(Term* term, float_t minimumSimilarity, size_t prefixLength):
- MultiTermQuery(term)
+ ScoreTerm(Term* _term, float_t _score):term(_term),score(_score){
+ }
+ virtual ~ScoreTerm(){
+ }
+ };
+
+ class FuzzyQuery::ScoreTermQueue : public PriorityQueue<ScoreTerm*, CL_NS(util)::Deletor::Object<ScoreTerm> > {
+ public:
+ ScoreTermQueue(int32_t size){
+ initialize(size, true);
+ }
+ virtual ~ScoreTermQueue(){
+ }
+
+ protected:
+ bool lessThan(ScoreTerm* termA, ScoreTerm* termB) {
+ if (termA->score == termB->score)
+ return termA->term->compareTo(termB->term) > 0;
+ else
+ return termA->score < termB->score;
+ }
+ };
+
+
+ FuzzyQuery::FuzzyQuery(Term* term, float_t _minimumSimilarity, size_t _prefixLength):
+ MultiTermQuery(term),minimumSimilarity(_minimumSimilarity),prefixLength(_prefixLength)
{
- //Func - Constructor
- //Pre - term != NULL
- //Post - The instance has been created
- if ( minimumSimilarity < 0 )
- minimumSimilarity = defaultMinSimilarity;
+ if ( minimumSimilarity < 0 )
+ minimumSimilarity = defaultMinSimilarity;
- CND_PRECONDITION(term != NULL,"term is NULL");
+ CND_PRECONDITION(term != NULL,"term is NULL");
- if (minimumSimilarity > 1.0f)
- _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
- else if (minimumSimilarity < 0.0f)
+ if (minimumSimilarity >= 1.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity >= 1");
+ else if (minimumSimilarity < 0.0f)
_CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");
-
- this->minimumSimilarity = minimumSimilarity;
-
- if(prefixLength >= term->textLength())
- _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
- this->prefixLength = prefixLength;
+ if (prefixLength < 0)
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
- }
+ /*
+ TODO: Not in original Java version
+ if(prefixLength >= term->textLength())
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
+ */
+ }
-
- float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
- int32_t FuzzyQuery::defaultPrefixLength = 0;
+ float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
+ int32_t FuzzyQuery::defaultPrefixLength = 0;
- FuzzyQuery::~FuzzyQuery(){
- //Func - Destructor
- //Pre - true
- //Post - Instance has been destroyed
- }
+ FuzzyQuery::~FuzzyQuery(){
+ }
- TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
- StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
- Term* term = getTerm(false); // no need to increase ref count
- if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
- buffer.append(term->field());
- buffer.append( _T(":"));
- }
- buffer.append(term->text());
- buffer.append( _T("~") );
- buffer.appendFloat(minimumSimilarity,1);
- // todo: use ToStringUtils.boost()
- if (getBoost() != 1.0f) {
- buffer.appendChar ( '^' );
- buffer.appendFloat( getBoost(),1);
- }
- return buffer.getBuffer();
- }
+ float_t FuzzyQuery::getMinSimilarity() const {
+ return minimumSimilarity;
+ }
+ size_t FuzzyQuery::getPrefixLength() const {
+ return prefixLength;
+ }
+
+ TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
+ StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
+ Term* term = getTerm(false); // no need to increase ref count
+ if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
+ buffer.append(term->field());
+ buffer.appendChar( _T(':'));
+ }
+ buffer.append(term->text());
+ buffer.appendChar( _T('~') );
+ buffer.appendFloat(minimumSimilarity,1);
+ buffer.appendBoost(getBoost());
+ return buffer.getBuffer();
+ }
+
const char* FuzzyQuery::getObjectName() const{
- //Func - Returns the name of the query
- //Pre - true
- //post - The string FuzzyQuery has been returned
+ //Func - Returns the name of the query
+ //Pre - true
+ //post - The string FuzzyQuery has been returned
- return getClassName();
+ return getClassName();
}
const char* FuzzyQuery::getClassName(){
- //Func - Returns the name of the query
- //Pre - true
- //post - The string FuzzyQuery has been returned
+ //Func - Returns the name of the query
+ //Pre - true
+ //post - The string FuzzyQuery has been returned
- return "FuzzyQuery";
+ return "FuzzyQuery";
}
-
- /**
- * Returns the minimum similarity that is required for this query to match.
- * @return float value between 0.0 and 1.0
- */
- float_t FuzzyQuery::getMinSimilarity() const {
- return minimumSimilarity;
- }
-
FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
- MultiTermQuery(clone)
- {
+ MultiTermQuery(clone)
+ {
this->minimumSimilarity = clone.getMinSimilarity();
this->prefixLength = clone.getPrefixLength();
-
- //if(prefixLength < 0)
- // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
- //else
- if(prefixLength >= clone.getTerm()->textLength())
- _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
- }
+ //if(prefixLength < 0)
+ // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
+ //else
+ if(prefixLength >= clone.getTerm()->textLength())
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
+ }
+
Query* FuzzyQuery::clone() const{
- return _CLNEW FuzzyQuery(*thi...
[truncated message content]