[Clucene-cvs] SF.net SVN: clucene:[3013] branches/lucene2_3_2/src/core

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3013
          http://clucene.svn.sourceforge.net/clucene/?rev=3013&view=rev
Author:   ustramooner
Date:     2009-07-08 09:54:46 +0000 (Wed, 08 Jul 2009)

Log Message:
-----------
Updated FuzzyQuery to conform with JL; it should work better now. The old similarity code - which looks to be optimized but wasn't working properly - is commented out as a reference for future optimizations.

Modified Paths:
--------------
    branches/lucene2_3_2/src/core/CLucene/CLConfig.h
    branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
    branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
    branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
    branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h
    branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h
    branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h
    branches/lucene2_3_2/src/core/files_list.txt

Modified: branches/lucene2_3_2/src/core/CLucene/CLConfig.h
===================================================================

--- branches/lucene2_3_2/src/core/CLucene/CLConfig.h	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/CLConfig.h	2009-07-08 09:54:46 UTC (rev 3013)
@@ -202,5 +202,19 @@
 //
 ////////////////////////////////////////////////////////////////////
 
+
+////////////////////////////////////////////////////////////////////
+//   FuzzyQuery settings
+////////////////////////////////////////////////////////////////////
+//
+//	This should be somewhere around the average long word.
+//	If it is longer, we waste time and space. If it is shorter, we waste a
+//	little bit of time growing the array as we encounter longer words.
+//	
+#define LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX 19
+//
+////////////////////////////////////////////////////////////////////
+
+
 #endif
 

Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp	2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,14 +12,8 @@
 CL_NS_DEF(search)
 
 
-    FilteredTermEnum::FilteredTermEnum(){
-	//Func - Constructor
-	//Pre  - true
-	//Post - Instance has been created
-		
-        currentTerm = NULL;
-        actualEnum = NULL;
-    }
+FilteredTermEnum::FilteredTermEnum():currentTerm(NULL),actualEnum(NULL){
+}
 
     FilteredTermEnum::~FilteredTermEnum() {
     //Func - Destructor
@@ -48,7 +42,7 @@
 		//The actual enumerator is not initialized!
 		if (actualEnum == NULL){
 			return false; 
-		    }
+		}
 
 		//Finalize the currentTerm and reset it to NULL
        _CLDECDELETE( currentTerm );
@@ -101,12 +95,11 @@
 		//Check if actualEnum is valid
 		if (actualEnum){
 			//Close the enumeration
-            actualEnum->close();
-		    }
+			actualEnum->close();
+			//Destroy the enumeration
+			_CLDELETE(actualEnum);
+		}
 
-        //Destroy the enumeration
-        _CLDELETE(actualEnum);
-
 		//Destroy currentTerm
         _CLDECDELETE(currentTerm);
     }
@@ -118,8 +111,7 @@
 
 		CND_PRECONDITION(actualEnum != NULL,"actualEnum is NULL");
 
-		_CLDELETE(this->actualEnum);
-
+		_CLLDELETE(this->actualEnum);
         this->actualEnum = actualEnum;
 
         // Find the first term that matches

Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h	2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,47 +12,48 @@
 #include "CLucene/index/Terms.h"
 
 CL_NS_DEF(search)
-  //FilteredTermEnum is an abstract class for enumerating a subset of all terms. 
-  //
-  //Term enumerations are always ordered by term->compareTo().  Each term in
-  //the enumeration is greater than all that precede it. 
-  
-  class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
-  public:
-      //Constructor
-      FilteredTermEnum();
-	  //Destructor
-      virtual ~FilteredTermEnum();
-        
-      //Equality measure on the term
-      virtual float_t difference() = 0;
+/** Abstract class for enumerating a subset of all terms. 
 
-      //Returns the docFreq of the current Term in the enumeration.
-      int32_t docFreq() const ;
-        
-      //Increments the enumeration to the next element
-      bool next() ;
-        
-      //Returns a pointer to the current Term in the enumeration.
-      CL_NS(index)::Term* term();
-      CL_NS(index)::Term* term(bool pointer);
-        
-      //Closes the enumeration to further activity, freeing resources.
-      void close();
+<p>Term enumerations are always ordered by Term.compareTo().  Each term in
+the enumeration is greater than all that precede it.  */
+class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
+public:
+	FilteredTermEnum();
+	virtual ~FilteredTermEnum();
 
-    protected:
-      //Equality compare on the term */
-      virtual bool termCompare(CL_NS(index)::Term* term) = 0;
-        
-      //Indiciates the end of the enumeration has been reached
-      virtual bool endEnum() = 0;
-        
-      void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
-    
-    private:
-        CL_NS(index)::Term* currentTerm;
-        CL_NS(index)::TermEnum* actualEnum;
-        
-    };
+	/** Equality measure on the term */
+	virtual float_t difference() = 0;
+
+	/** 
+	* Returns the docFreq of the current Term in the enumeration.
+	* Returns -1 if no Term matches or all terms have been enumerated.
+	*/
+	int32_t docFreq() const;
+
+	/** Increments the enumeration to the next element.  True if one exists. */
+	bool next() ;
+
+	/** Returns the current Term in the enumeration.
+	* Returns null if no Term matches or all terms have been enumerated. */
+	CL_NS(index)::Term* term(bool pointer);
+	CL_NS(index)::Term* term();
+
+	/** Closes the enumeration to further activity, freeing resources.  */
+	void close();
+
+protected:
+	/** Equality compare on the term */
+	virtual bool termCompare(CL_NS(index)::Term* term) = 0;
+
+	/** Indicates the end of the enumeration has been reached */
+	virtual bool endEnum() = 0;
+
+	void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
+
+private:
+	CL_NS(index)::Term* currentTerm;
+	CL_NS(index)::TermEnum* actualEnum;
+
+};
 CL_NS_END
 #endif

Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp	2009-07-08 09:54:46 UTC (rev 3013)
@@ -8,366 +8,459 @@
 #include "CLucene/index/Term.h"
 #include "CLucene/index/IndexReader.h"
 #include "Similarity.h"
-#include "CLucene/util/StringBuffer.h"
 #include "FuzzyQuery.h"
+#include "BooleanQuery.h"
+#include "BooleanClause.h"
+#include "TermQuery.h"
 
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/PriorityQueue.h"
+
 CL_NS_USE(index)
 CL_NS_USE(util)
 CL_NS_DEF(search)
 
   
-	/** Finds and returns the smallest of three integers 
-		precondition: Must define int32_t __t for temporary storage and result
-	*/
-	#define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
+/** Finds and returns the smallest of three integers 
+ *	precondition: Must define int32_t __t for temporary storage and result
+ */
+#define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
 
 
-	/**
-     * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
-     * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
-     * <code>minSimilarity</code>. 
-     * 
-     * @param reader Delivers terms.
-     * @param term Pattern term.
-     * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
-     * @param prefixLength Length of required common prefix. Default value is 0.
-     * @throws IOException
-     */
-	 FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t prefixLength): 
-        distance(0),
-        _endEnum(false),
-		prefix(STRDUP_TtoT(LUCENE_BLANK_STRING)),
-		prefixLength(0),
+	FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t _prefixLength): 
+		FilteredTermEnum(),d(NULL),dWidth(0),dHeight(0),_similarity(0),_endEnum(false),searchTerm(_CL_POINTER(term)),
+		text(NULL),textLen(0),prefix(NULL)/* ISH: was STRDUP_TtoT(LUCENE_BLANK_STRING)*/,prefixLength(_prefixLength),
 		minimumSimilarity(minSimilarity)
 	{
-	//Func - Constructor
-	//Pre  - reader contains a valid reference to an IndexReader
-	//       term != NULL
-	//Post - The instance has been created
+		CND_PRECONDITION(term != NULL,"term is NULL");
 
-		CND_PRECONDITION(term != NULL,"term is NULL");
-		
-		scale_factor = 1.0f / (1.0f - minimumSimilarity);
-		searchTerm = _CL_POINTER(term);
-		
-		text = STRDUP_TtoT(term->text());
-		textLen = term->textLength();
-		
-		
+		if (minSimilarity >= 1.0f)
+			_CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be greater than or equal to 1");
+		else if (minSimilarity < 0.0f)
+			_CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be less than 0");
+		if(_prefixLength < 0)
+			_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength cannot be less than 0");
+
+		scale_factor = 1.0f / (1.0f - minimumSimilarity); // only now we are safe from a division by zero
+		//TODO: this.field = searchTerm.field();
+
+		//The prefix could be longer than the word.
+		//It's kind of silly though.  It means we must match the entire word.
+		const size_t fullSearchTermLength = searchTerm->textLength();
+		const size_t realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;
+
+		text = STRDUP_TtoT(searchTerm->text() + realPrefixLength);
+		textLen = fullSearchTermLength - realPrefixLength;
+
+		// TODO: what is safer to use, prefixLength or realPrefixLength?
+		prefix = _CL_NEWARRAY(TCHAR,realPrefixLength+1);
+		_tcsncpy(prefix, searchTerm->text(), realPrefixLength);
+		prefix[realPrefixLength]='\0';
+
+		initializeMaxDistances();
+		dWidth = LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; // default length of the d array
+		dHeight = textLen + 1;
+
+		Term* trm = _CLNEW Term(searchTerm->field(), prefix, true); // _CLNEW Term(term, prefix); -- not intern'd?
+		setEnum(reader->terms(trm));
+		_CLDECDELETE(trm);
+
+
+		/* LEGACY:
 		//Initialize e to NULL
 		e          = NULL;
 		eWidth     = 0;
 		eHeight    = 0;
-		
+
 		if(prefixLength > 0 && prefixLength < textLen){
-			this->prefixLength = prefixLength;
-		
-			prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
-			_tcsncpy(prefix,text,prefixLength);
-			prefix[prefixLength]='\0';
-		
-			textLen = prefixLength;
-			text[textLen]='\0';
-		}
-		
-		
-		//Set the enumeration 
-		Term* trm = _CLNEW Term(term, prefix);
-		setEnum(reader->terms(trm));
-		_CLDECDELETE(trm);
-  }
+		this->prefixLength = prefixLength;
 
-  FuzzyTermEnum::~FuzzyTermEnum(){
-  //Func - Destructor
-  //Pre  - true
-  //Post - FuzzyTermEnum has been destroyed
+		prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
+		_tcsncpy(prefix,text,prefixLength);
+		prefix[prefixLength]='\0';
 
-	  //Close the enumeration
-	  close();
-  }
+		textLen = prefixLength;
+		text[textLen]='\0';
+		}
+		*/
+	}
+	
+	FuzzyTermEnum::~FuzzyTermEnum(){
+		close();
+	}
 
 	const char* FuzzyTermEnum::getObjectName() const{ return getClassName(); }
 	const char* FuzzyTermEnum::getClassName(){ return "FuzzyTermEnum"; }
 
-  bool FuzzyTermEnum::endEnum() {
-  //Func - Returns the fact if the current term in the enumeration has reached the end
-  //Pre  - true
-  //Post - The boolean value of endEnum has been returned
+	bool FuzzyTermEnum::endEnum() {
+		return _endEnum;
+	}
 
-      return _endEnum;
-  }
+	void FuzzyTermEnum::close(){
 
-  void FuzzyTermEnum::close(){
-  //Func - Close the enumeration
-  //Pre  - true
-  //Post - The enumeration has been closed
+		FilteredTermEnum::close();
 
-      FilteredTermEnum::close();
-	  
-      //Finalize the searchTerm
-      _CLDECDELETE(searchTerm);
-	  //Destroy e
-      _CLDELETE_ARRAY(e);
+		//Finalize the searchTerm
+		_CLDECDELETE(searchTerm);
 
-	  _CLDELETE_CARRAY(text);
+		free(d);
+		d=NULL;
 
-	  _CLDELETE_CARRAY(prefix);
-  }
+		_CLDELETE_CARRAY(text);
 
-  bool FuzzyTermEnum::termCompare(Term* term) {
-  //Func - Compares term with the searchTerm using the Levenshtein distance.
-  //Pre  - term is NULL or term points to a Term
-  //Post - if pre(term) is NULL then false is returned otherwise
-  //       if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD
-  //       then true is returned 
-	  
-	  if (term == NULL){
-		  return false;  //Note that endEnum is not set to true!
-	  }
+		_CLDELETE_CARRAY(prefix);
+	}
 
-	  const TCHAR* termText = term->text();
-	  size_t termTextLen = term->textLength();
+	bool FuzzyTermEnum::termCompare(Term* term) {
+		//Func - Compares term with the searchTerm using the Levenshtein distance.
+		//Pre  - term is NULL or term points to a Term
+		//Post - if pre(term) is NULL then false is returned; otherwise true is
+		//       returned if the similarity of the current term in the enumeration
+		//       is greater than minimumSimilarity
 
-		  //Check if the field name of searchTerm of term match
-		  //(we can use == because fields are interned)
-      if ( searchTerm->field() == term->field() && 
-		  	(prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+		if (term == NULL){
+			return false;  //Note that endEnum is not set to true!
+		}
 
-			const TCHAR* target = termText+prefixLength;
-			size_t targetLen = termTextLen-prefixLength;
+		const TCHAR* termText = term->text();
+		const size_t termTextLen = term->textLength();
 
-		    //Calculate the Levenshtein distance
-			int32_t dist = editDistance(text, target, textLen, targetLen);
-			distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
-			return (distance > minimumSimilarity);
-      }
+		//Check if the field names of searchTerm and term match
+		//(we can use == because fields are interned)
+		if ( searchTerm->field() == term->field() && 
+			(prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+
+				const TCHAR* target = termText+prefixLength;
+				const size_t targetLen = termTextLen-prefixLength;
+				_similarity = similarity(target, targetLen);
+				return (_similarity > minimumSimilarity);
+
+				/* LEGACY:
+				//Calculate the Levenshtein distance
+				int32_t dist = editDistance(text, target, textLen, targetLen);
+				distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
+				return (distance > minimumSimilarity);
+				*/
+		}
 		_endEnum = true;
 		return false;
-  }
+	}
 
-  float_t FuzzyTermEnum::difference() {
-  //Func - Returns the difference between the distance and the fuzzy threshold
-  //       multiplied by the scale factor
-  //Pre  - true
-  //Post - The difference is returned
+	float_t FuzzyTermEnum::difference() {
+		return (float_t)((_similarity - minimumSimilarity) * scale_factor );
+	}
 
-     return (float_t)((distance - minimumSimilarity) * scale_factor );
-  }
+	// TODO: had synchronized in definition
+	float_t FuzzyTermEnum::similarity(const TCHAR* target, const size_t m) {
+		const size_t n = textLen; // TODO: remove after replacing n with textLen
+		if (n == 0)  {
+			//we don't have anything to compare.  That means if we just add
+			//the letters for m we get the new word
+			return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) m / prefixLength);
+		}
+		if (m == 0) {
+			return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) n / prefixLength);
+		}
+
+		const int32_t maxDistance = getMaxDistance(m);
+
+		if (maxDistance < abs((int32_t)(m-n))) {
+			//just adding the characters of m to n or vice-versa results in
+			//too many edits
+			//for example "pre" length is 3 and "prefixes" length is 8.  We can see that
+			//given this optimal circumstance, the edit distance cannot be less than 5.
+			//which is 8-3 or more precisely Math.abs(3-8).
+			//if our maximum edit distance is 4, then we can discard this word
+			//without looking at it.
+			return 0.0f;
+		}
+
+		//let's make sure we have enough room in our array to do the distance calculations.
+		//Check if the array must be reallocated because it is too small or does not exist
+
+		// TODO:	realloc should be able to allocate memory for NULL pointers; if that's the case, the NULL
+		//			check here is redundant
+		if (d == NULL){
+			dWidth  = cl_max(dWidth, n+1);
+			dHeight = cl_max(dHeight, m+1);
+			d = reinterpret_cast<int32_t*>(malloc(sizeof(int32_t)*dWidth*dHeight));
+		} else if (dWidth <= n || dHeight <= m) {
+			//growDistanceArray
+			dWidth  = cl_max(dWidth, n+1);
+			dHeight = cl_max(dHeight, m+1);
+			d = reinterpret_cast<int32_t*>(realloc(d, sizeof(int32_t)*dWidth*dHeight));
+		}
+
+		size_t i;     // iterates through the source string
+		size_t j;     // iterates through the target string
+
+		// init matrix d
+		for (i = 0; i <= n; i++){
+			d[i + (0*dWidth)] = i;
+		}
+		for (j = 0; j <= m; j++){
+			d[0 + (j*dWidth)] = j;
+		}
+
+		int32_t __t; //temporary variable for min3
+
+		// start computing edit distance
+		TCHAR s_i; // ith character of s
+		for (i = 1; i <= n; i++) {
+			int32_t bestPossibleEditDistance = m;
+			s_i = text[i - 1];
+			for (j = 1; j <= m; j++) {
+				if (s_i != target[j-1]) {
+					min3(d[i-1 + (j*dWidth)], d[i + ((j-1)*dWidth)], d[i-1 + ((j-1)*dWidth)]);
+					d[i + (j*dWidth)] = __t+1;
+				}
+				else {
+					min3(d[i-1 + (j*dWidth)]+1, d[i + ((j-1)*dWidth)]+1, d[i-1 + ((j-1)*dWidth)]);
+					d[i + (j*dWidth)] = __t;
+				}
+				bestPossibleEditDistance = cl_min(bestPossibleEditDistance, d[i + (j*dWidth)]);
+			}
+
+			//After calculating row i, the best possible edit distance
+			//can be found by finding the smallest value in a given column.
+			//If the bestPossibleEditDistance is greater than the max distance, abort.
+
+			if (i > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal is okay, but not greater
+				//the closest the target can be to the text is just too far away.
+				//this target is leaving the party early.
+				return 0.0f;
+			}
+		}
+
+		// this will return less than 0.0 when the edit distance is
+		// greater than the number of characters in the shorter word.
+		// but this was the formula that was previously used in FuzzyTermEnum,
+		// so it has not been changed (even though minimumSimilarity must be
+		// greater than 0.0)
+		return 1.0f - ((float_t)d[n + m*dWidth] / (float_t) (prefixLength + cl_min(n, m)));
+	}
+
+	int32_t FuzzyTermEnum::getMaxDistance(const size_t m) {
+		return (m < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX) ? maxDistances[m] : calculateMaxDistance(m);
+	}
+
+	void FuzzyTermEnum::initializeMaxDistances() {
+		for (int32_t i = 0; i < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; i++) {
+			maxDistances[i] = calculateMaxDistance(i);
+		}
+	}
+
+	int32_t FuzzyTermEnum::calculateMaxDistance(const size_t m) const {
+		return (int32_t) ((1-minimumSimilarity) * (cl_min(textLen, m) + prefixLength));
+	}
   
+	/* LEGACY:
+	int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
+		//Func - Calculates the Levenshtein distance, also known as edit distance, which is a measure of similarity
+		//       between two strings where the distance is measured as the number of character
+		//       deletions, insertions or substitutions required to transform one string to
+		//       the other string.
+		//Pre  - s != NULL and contains the source string
+		//       t != NULL and contains the target string
+		//       n >= 0 and contains the length of the source string
+		//       m >= 0 and contains the length of the target string
+		//Post - The distance has been returned
 
-  int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
-  //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity
-  //       between two strings where the distance is measured as the number of character
-  //       deletions, insertions or substitutions required to transform one string to
-  //       the other string.
-  //Pre  - s != NULL and contains the source string
-  //       t != NULL and contains the target string
-  //       n >= 0 and contains the length of the source string
-  //       m >= 0 and containts the length of th target string
-  //Post - The distance has been returned
+		CND_PRECONDITION(s != NULL, "s is NULL");
+		CND_PRECONDITION(t != NULL, "t is NULL");
+		CND_PRECONDITION(n >= 0," n is a negative number");
+		CND_PRECONDITION(n >= 0," n is a negative number");
 
-      CND_PRECONDITION(s != NULL, "s is NULL");
-      CND_PRECONDITION(t != NULL, "t is NULL");
-	  CND_PRECONDITION(n >= 0," n is a negative number");
-	  CND_PRECONDITION(n >= 0," n is a negative number");
+		int32_t i;     // iterates through s
+		int32_t j;     // iterates through t
+		TCHAR s_i; // ith character of s
 
-      int32_t i;     // iterates through s
-      int32_t j;     // iterates through t
-      TCHAR s_i; // ith character of s
+		if (n == 0) 
+			return m;
+		if (m == 0) 
+			return n;
 
-      if (n == 0) 
-          return m;
-      if (m == 0) 
-          return n;
+		//Check if the array must be reallocated because it is too small or does not exist
+		if (e == NULL || eWidth <= n || eHeight <= m) {
+			//Delete e if possible
+			_CLDELETE_ARRAY(e);
+			//resize e
+			eWidth  = cl_max(eWidth, n+1);
+			eHeight = cl_max(eHeight, m+1);
+			e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
+		}
 
-	//Check if the array must be reallocated because it is too small or does not exist
-    if (e == NULL || eWidth <= n || eHeight <= m) {
-        //Delete e if possible
-        _CLDELETE_ARRAY(e);
-        //resize e
-		eWidth  = cl_max(eWidth, n+1);
-        eHeight = cl_max(eHeight, m+1);
-        e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
-    }
-    
-    CND_CONDITION(e != NULL,"e is NULL");
+		CND_CONDITION(e != NULL,"e is NULL");
 
-    // init matrix e
-	for (i = 0; i <= n; i++){
-        e[i + (0*eWidth)] = i;
-    }
-	for (j = 0; j <= m; j++){
-        e[0 + (j*eWidth)] = j;
-    }
+		// init matrix e
+		for (i = 0; i <= n; i++){
+			e[i + (0*eWidth)] = i;
+		}
+		for (j = 0; j <= m; j++){
+			e[0 + (j*eWidth)] = j;
+		}
 
-	int32_t __t; //temporary variable for min3
+		int32_t __t; //temporary variable for min3
 
-    // start computing edit distance
-    for (i = 1; i <= n; i++) {
-        s_i = s[i - 1];
-        for (j = 1; j <= m; j++) {
-			if (s_i != t[j-1]){
-				min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
-                e[i + (j*eWidth)] = __t+1;
-			}else{
-				min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
-                e[i + (j*eWidth)] = __t;
+		// start computing edit distance
+		for (i = 1; i <= n; i++) {
+			s_i = s[i - 1];
+			for (j = 1; j <= m; j++) {
+				if (s_i != t[j-1]){
+					min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
+					e[i + (j*eWidth)] = __t+1;
+				}else{
+					min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
+					e[i + (j*eWidth)] = __t;
+				}
 			}
-        }
-    }
+		}
 
-    // we got the result!
-    return e[n + ((m)*eWidth)];
-  }
+		// we got the result!
+		return e[n + ((m)*eWidth)];
+	}*/
 
+  class FuzzyQuery::ScoreTerm {
+  public:
+	  Term* term;
+	  float_t score;
 
-  /**
-   * Create a new FuzzyQuery that will match terms with a similarity 
-   * of at least <code>minimumSimilarity</code> to <code>term</code>.
-   * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
-   * of that length is also required.
-   * 
-   * @param term the term to search for
-   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
-   *  between the query term and the matching terms. For example, for a
-   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
-   *  as the query term is considered similar to the query term if the edit distance
-   *  between both terms is less than <code>length(term)*0.5</code>
-   * @param prefixLength length of common (non-fuzzy) prefix
-   * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
-   * or if prefixLength &lt; 0 or &gt; <code>term.text().length()</code>.
-   */
-  FuzzyQuery::FuzzyQuery(Term* term, float_t minimumSimilarity, size_t prefixLength):
-	MultiTermQuery(term)
+	  ScoreTerm(Term* _term, float_t _score):term(_term),score(_score){
+	  }
+	  virtual ~ScoreTerm(){
+	  }
+  };
+
+  class FuzzyQuery::ScoreTermQueue : public PriorityQueue<ScoreTerm*, CL_NS(util)::Deletor::Object<ScoreTerm> > {
+  public:
+	  ScoreTermQueue(int32_t size){
+		  initialize(size, true);
+	  }
+	  virtual ~ScoreTermQueue(){
+	  }
+
+  protected:
+	  bool lessThan(ScoreTerm* termA, ScoreTerm* termB) {
+		  if (termA->score == termB->score)
+			  return termA->term->compareTo(termB->term) > 0;
+		  else
+			  return termA->score < termB->score;
+	  }
+  };
+
+
+  FuzzyQuery::FuzzyQuery(Term* term, float_t _minimumSimilarity, size_t _prefixLength):
+  MultiTermQuery(term),minimumSimilarity(_minimumSimilarity),prefixLength(_prefixLength)
   {
-  //Func - Constructor
-  //Pre  - term != NULL
-  //Post - The instance has been created
-        if ( minimumSimilarity < 0 )
-            minimumSimilarity = defaultMinSimilarity;
+	  if ( minimumSimilarity < 0 )
+		  minimumSimilarity = defaultMinSimilarity;
 
-	    CND_PRECONDITION(term != NULL,"term is NULL");
+	  CND_PRECONDITION(term != NULL,"term is NULL");
 
-	    if (minimumSimilarity > 1.0f)
-		  _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
-        else if (minimumSimilarity < 0.0f)
+	  if (minimumSimilarity >= 1.0f)
+		  _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity >= 1");
+	  else if (minimumSimilarity < 0.0f)
 		  _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");
-    
-	    this->minimumSimilarity = minimumSimilarity;
-    
-		if(prefixLength >= term->textLength())
-			_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
-		this->prefixLength = prefixLength;
+	  if (prefixLength < 0)
+		  _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
 
-    }
+	  /*
+	  TODO: Not in original Java version
+	  if(prefixLength >= term->textLength())
+	  _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
+	  */
+  }
   
-  
-    float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
-	int32_t FuzzyQuery::defaultPrefixLength = 0;
+  float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
+  int32_t FuzzyQuery::defaultPrefixLength = 0;
 
-    FuzzyQuery::~FuzzyQuery(){
-    //Func - Destructor
-	//Pre  - true
-	//Post - Instance has been destroyed
-    }
+  FuzzyQuery::~FuzzyQuery(){
+  }
 
-    TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
-      StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
-      Term* term = getTerm(false); // no need to increase ref count
-      if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
-        buffer.append(term->field());
-        buffer.append( _T(":"));
-      }
-      buffer.append(term->text());
-      buffer.append( _T("~") );
-      buffer.appendFloat(minimumSimilarity,1);
-      // todo: use ToStringUtils.boost()
-      if (getBoost() != 1.0f) {
-        buffer.appendChar ( '^' );
-        buffer.appendFloat( getBoost(),1);
-      }
-      return buffer.getBuffer();
-    }
+  float_t FuzzyQuery::getMinSimilarity() const {
+    return minimumSimilarity;
+  }
 
+  size_t FuzzyQuery::getPrefixLength() const {
+    return prefixLength;
+  }
+
+  TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
+	  StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
+	  Term* term = getTerm(false); // no need to increase ref count
+	  if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
+		  buffer.append(term->field());
+		  buffer.appendChar( _T(':'));
+	  }
+	  buffer.append(term->text());
+	  buffer.appendChar( _T('~') );
+	  buffer.appendFloat(minimumSimilarity,1);
+	  buffer.appendBoost(getBoost());
+	  return buffer.getBuffer();
+  }
+
   const char* FuzzyQuery::getObjectName() const{
-  //Func - Returns the name of the query
-  //Pre  - true
-  //post - The string FuzzyQuery has been returned
+	  //Func - Returns the name of the query
+	  //Pre  - true
+	  //post - The string FuzzyQuery has been returned
 
-     return getClassName();
+	  return getClassName();
   }
   const char* FuzzyQuery::getClassName(){
-  //Func - Returns the name of the query
-  //Pre  - true
-  //post - The string FuzzyQuery has been returned
+	  //Func - Returns the name of the query
+	  //Pre  - true
+	  //post - The string FuzzyQuery has been returned
 
-     return "FuzzyQuery";
+	  return "FuzzyQuery";
   }
 
-
-  /**
-   * Returns the minimum similarity that is required for this query to match.
-   * @return float value between 0.0 and 1.0
-   */
-  float_t FuzzyQuery::getMinSimilarity() const {
-    return minimumSimilarity;
-  }
-
   FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
-		MultiTermQuery(clone)
-	{
+  MultiTermQuery(clone)
+  {
 	  this->minimumSimilarity = clone.getMinSimilarity();
 	  this->prefixLength = clone.getPrefixLength();
-    
-		//if(prefixLength < 0)
-		//	_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
-		//else 
-		if(prefixLength >= clone.getTerm()->textLength())
-			_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
 
-	}
+	  //if(prefixLength < 0)
+	  //	_CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
+	  //else 
+	  if(prefixLength >= clone.getTerm()->textLength())
+		  _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
 
+  }
+
   Query* FuzzyQuery::clone() const{
-		return _CLNEW FuzzyQuery(*this);
-	}
-	size_t FuzzyQuery::hashCode() const{
-		//todo: we should give the query a seeding value... but
-		//need to do it for all hascode functions
-		size_t val = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode();
-		val ^= Similarity::floatToByte(this->getMinSimilarity());
-		val ^= this->getPrefixLength();
-		return val;
-	}
-	bool FuzzyQuery::equals(Query* other) const{
-		if (!(other->instanceOf(FuzzyQuery::getClassName())))
-			return false;
+	  return _CLNEW FuzzyQuery(*this);
+  }
+  size_t FuzzyQuery::hashCode() const{
+	  //todo: we should give the query a seeding value... but
+	  //need to do it for all hashCode functions
+	  // TODO: does not conform with JL
+	  size_t val = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode();
+	  val ^= Similarity::floatToByte(this->getMinSimilarity());
+	  val ^= this->getPrefixLength();
+	  return val;
+  }
+  bool FuzzyQuery::equals(Query* other) const{
+	  if (this == other) return true;
+	  if (!(other->instanceOf(FuzzyQuery::getClassName())))
+		  return false;
 
-		FuzzyQuery* fq = (FuzzyQuery*)other;
-		return (this->getBoost() == fq->getBoost())
-			&& this->getMinSimilarity() == fq->getMinSimilarity()
-			&& this->getPrefixLength() == fq->getPrefixLength()
-			&& getTerm()->equals(fq->getTerm());
-	}
-    
-  /**
-   * Returns the prefix length, i.e. the number of characters at the start
-   * of a term that must be identical (not fuzzy) to the query term if the query
-   * is to match that term. 
-   */
-  size_t FuzzyQuery::getPrefixLength() const {
-    return prefixLength;
+	  FuzzyQuery* fq = static_cast<FuzzyQuery*>(other);
+	  return (this->getBoost() == fq->getBoost())
+		  && this->minimumSimilarity == fq->getMinSimilarity()
+		  && this->prefixLength == fq->getPrefixLength()
+		  && getTerm()->equals(fq->getTerm());
   }
-
+  
   FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){
 	  Term* term = getTerm(false);
 	  FuzzyTermEnum* ret = _CLNEW FuzzyTermEnum(reader, term, minimumSimilarity, prefixLength);
 	  return ret;
   }
 
-  /*
   Query* FuzzyQuery::rewrite(IndexReader* reader) {
 	  FilteredTermEnum* enumerator = getEnum(reader);
-	  int32_t maxClauseCount = BooleanQuery::getMaxClauseCount();
+	  const int32_t maxClauseCount = BooleanQuery::getMaxClauseCount();
 	  ScoreTermQueue* stQueue = _CLNEW ScoreTermQueue(maxClauseCount);
 	  ScoreTerm* reusableST = NULL;
 
@@ -375,7 +468,7 @@
 		  do {
 			  float_t score = 0.0f;
 			  Term* t = enumerator->term();
-			  if (t != null) {
+			  if (t != NULL) {
 				  score = enumerator->difference();
 				  if (reusableST == NULL) {
 					  reusableST = _CLNEW ScoreTerm(t, score);
@@ -389,27 +482,28 @@
 					  continue;
 				  }
 
-				  reusableST = (ScoreTerm) stQueue->insertWithOverflow(reusableST);
+				  reusableST = stQueue->insertWithOverflow(reusableST);
 			  }
 		  } while (enumerator->next());
 	  } _CLFINALLY({
 		  enumerator->close();
-		  _CLDELETE(enumerator);
-	  }
+		  _CLLDELETE(enumerator);
+	  })
 
-	  BooleanQuery query = _CLNEW BooleanQuery(true);
-	  int size = stQueue->size();
-	  for(int i = 0; i < size; i++){
-		ScoreTerm* st = (ScoreTerm) stQueue->pop();
-		TermQuery* tq = new TermQuery(st.term);      // found a match
-		tq->setBoost(getBoost() * st.score); // set the boost
-		query->add(tq, BooleanClause.Occur.SHOULD);          // add to query
+	  BooleanQuery* query = _CLNEW BooleanQuery(true);
+	  const size_t size = stQueue->size();
+	  for(size_t i = 0; i < size; i++){
+		  ScoreTerm* st = stQueue->pop();
+		  TermQuery* tq = _CLNEW TermQuery(st->term);      // found a match
+		  tq->setBoost(getBoost() * st->score); // set the boost
+		  query->add(tq, BooleanClause::SHOULD);          // add to query
 	  }
+	  _CLLDELETE(stQueue);
 
-	  _CLDELETE(reusableST);
+	  //_CLDELETE(reusableST);
 
 	  return query;
-	  }*/
+  }
 
 
 CL_NS_END

Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h	2009-07-08 09:54:46 UTC (rev 3013)
@@ -7,28 +7,30 @@
 #ifndef _lucene_search_FuzzyQuery_
 #define _lucene_search_FuzzyQuery_
 
-
-//#include "CLucene/index/IndexReader.h"
-CL_CLASS_DEF(index,Term)
-//#include "MultiTermQuery.h"
 #include "MultiTermQuery.h"
 #include "FilteredTermEnum.h"
 
+CL_CLASS_DEF(index,Term)
 
 CL_NS_DEF(search)
 
-  // class FuzzyQuery implements the fuzzy search query
-  class CLUCENE_EXPORT FuzzyQuery: public MultiTermQuery {
-    private:
-	  float_t minimumSimilarity;
-	  size_t prefixLength;
-  protected:
-	  FuzzyQuery(const FuzzyQuery& clone);
-   public:
-	  static float_t defaultMinSimilarity;
-	  static int32_t defaultPrefixLength;
+/** Implements the fuzzy search query. The similarity measurement
+* is based on the Levenshtein (edit distance) algorithm.
+*/
+class CLUCENE_EXPORT FuzzyQuery: public MultiTermQuery {
+private:
+	class ScoreTerm;
+	class ScoreTermQueue;
 
-     /**
+	float_t minimumSimilarity;
+	size_t prefixLength;
+protected:
+	FuzzyQuery(const FuzzyQuery& clone);
+public:
+	static float_t defaultMinSimilarity;
+	static int32_t defaultPrefixLength;
+
+	/**
 	* Create a new FuzzyQuery that will match terms with a similarity 
 	* of at least <code>minimumSimilarity</code> to <code>term</code>.
 	* If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
@@ -44,115 +46,192 @@
 	* @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
 	* or if prefixLength &lt; 0 or &gt; <code>term.text().length()</code>.
 	*/
-     FuzzyQuery(CL_NS(index)::Term* term, float_t minimumSimilarity=-1, size_t prefixLength=0);
-	 //Destructor
-     ~FuzzyQuery();
+	FuzzyQuery(CL_NS(index)::Term* term, float_t minimumSimilarity=-1, size_t prefixLength=0);
+	virtual ~FuzzyQuery();
 
-     TCHAR* toString(const TCHAR* field) const;
+	/**
+	* Returns the minimum similarity that is required for this query to match.
+	* @return float value between 0.0 and 1.0
+	*/
+	float_t getMinSimilarity() const;
 
-	  //Returns the name "FuzzyQuery"
-	  static const char* getClassName();
-    const char* getObjectName() const;
+	/**
+	* Returns the prefix length, i.e. the number of characters at the start
+	* of a term that must be identical (not fuzzy) to the query term if the query
+	* is to match that term. 
+	*/
+	size_t getPrefixLength() const;
 
-	  Query* clone() const;
-	  bool equals(Query * other) const;
-	  size_t hashCode() const;
+	Query* rewrite(CL_NS(index)::IndexReader* reader);
 
-	  /**
-		* Returns the minimum similarity that is required for this query to match.
-		* @return float value between 0.0 and 1.0
-		*/
-		float_t getMinSimilarity() const;
+	TCHAR* toString(const TCHAR* field) const;
 
-		/**
-		* Returns the prefix length, i.e. the number of characters at the start
-		* of a term that must be identical (not fuzzy) to the query term if the query
-		* is to match that term. 
-		*/
-		size_t getPrefixLength() const;
+	//Returns the name "FuzzyQuery"
+	static const char* getClassName();
+	const char* getObjectName() const;
 
-		//Query* FuzzyQuery::rewrite(IndexReader* reader)
+	Query* clone() const;
+	bool equals(Query * other) const;
+	size_t hashCode() const;
 
-  protected:
-	  FilteredTermEnum* getEnum(CL_NS(index)::IndexReader* reader);
-  };
+protected:
+	FilteredTermEnum* getEnum(CL_NS(index)::IndexReader* reader);
+};
 
 /** Subclass of FilteredTermEnum for enumerating all terms that are similiar
- * to the specified filter term.
- *
- * <p>Term enumerations are always ordered by Term.compareTo().  Each term in
- * the enumeration is greater than all that precede it.
- */
+* to the specified filter term.
+*
+* <p>Term enumerations are always ordered by Term.compareTo().  Each term in
+* the enumeration is greater than all that precede it.
+*/
 class CLUCENE_EXPORT FuzzyTermEnum: public FilteredTermEnum {
-  private:
-		float_t distance;
-		bool _endEnum;
+private:
+	/* Allows us to save the time required to create a new array
+	* every time similarity is called.
+	*/
+	int32_t* d;
+	size_t dWidth;
+	size_t dHeight;
 
-		CL_NS(index)::Term* searchTerm; 
-		TCHAR* text;
-		size_t textLen;
-		TCHAR* prefix;
-		size_t prefixLength;
-		float_t minimumSimilarity;
-		double scale_factor;
+	//float_t distance;
+	float_t _similarity;
+	bool _endEnum;
 
-		
-		/**
-		* This static array saves us from the time required to create a new array
-		* everytime editDistance is called.
-		*/
-		int32_t* e;
-		int32_t eWidth;
-		int32_t eHeight;
+	CL_NS(index)::Term* searchTerm; 
+	//String field;
+	TCHAR* text;
+	size_t textLen;
+	TCHAR* prefix;
+	size_t prefixLength;
 
-		/******************************
-		* Compute Levenshtein distance
-		******************************/
- 
-		/**
-		Levenshtein distance also known as edit distance is a measure of similiarity
-		between two strings where the distance is measured as the number of character 
-		deletions, insertions or substitutions required to transform one string to 
-		the other string. 
-		<p>This method takes in four parameters; two strings and their respective 
-		lengths to compute the Levenshtein distance between the two strings.
-		The result is returned as an integer.
-		*/ 
-		int32_t editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) ;
+	float_t minimumSimilarity;
+	double scale_factor;
+	int32_t maxDistances[LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX];
 
-    protected:
-		/**
-		* The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
-		* calculate the distance between the given term and the comparing term. 
-		*/
-		bool termCompare(CL_NS(index)::Term* term) ;
-		
-		///Returns the fact if the current term in the enumeration has reached the end
-		bool endEnum();
-    public:
-		
-		/**
-		* Empty prefix and minSimilarity of 0.5f are used.
-		* 
-		* @param reader
-		* @param term
-		* @throws IOException
-		* @see #FuzzyTermEnum(IndexReader, Term, float_t, int32_t)
-		*/
-		FuzzyTermEnum(CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* term, float_t minSimilarity=FuzzyQuery::defaultMinSimilarity, size_t prefixLength=0);
-		/** Destructor */
-		~FuzzyTermEnum();
-		/** Close the enumeration */
-		void close();
-		
-		/** Returns the difference between the distance and the fuzzy threshold
-		*  multiplied by the scale factor
-		*/
-		float_t difference();
 
-		
-		const char* getObjectName() const;
-		static const char* getClassName();
-  };
+
+	/* LEGACY:
+	int32_t* e;
+	int32_t eWidth;
+	int32_t eHeight;
+	**
+	Levenshtein distance also known as edit distance is a measure of similiarity
+	between two strings where the distance is measured as the number of character 
+	deletions, insertions or substitutions required to transform one string to 
+	the other string. 
+	<p>This method takes in four parameters; two strings and their respective 
+	lengths to compute the Levenshtein distance between the two strings.
+	The result is returned as an integer.
+	*
+	int32_t editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m);
+	*/
+
+	/******************************
+	* Compute Levenshtein distance
+	******************************/
+
+	/**
+	* <p>Similarity returns a number that is 1.0f or less (including negative numbers)
+	* based on how similar the Term is compared to a target term.  It returns
+	* exactly 0.0f when
+	* <pre>
+	*    editDistance &gt; maximumEditDistance</pre>
+	* Otherwise it returns:
+	* <pre>
+	*    1 - (editDistance / length)</pre>
+	* where length is the length of the shortest term (text or target) including a
+	* prefix that is identical and editDistance is the Levenshtein distance for
+	* the two words.</p>
+	*
+	* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
+	* algorithm.  The fail-fast algorithm differs from the standard Levenshtein
+	* distance algorithm in that it is aborted if it is discovered that the
+	* minimum distance between the words is greater than some threshold.
+	*
+	* <p>To calculate the maximum distance threshold we use the following formula:
+	* <pre>
+	*     (1 - minimumSimilarity) * length</pre>
+	* where length is the shortest term including any prefix that is not part of the
+	* similarity comparison.  This formula was derived by solving for what maximum value
+	* of distance returns false for the following statements:
+	* <pre>
+	*   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+	*   return (similarity > minimumSimilarity);</pre>
+	* where distance is the Levenshtein distance for the two words.
+	* </p>
+	* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
+	* between two strings where the distance is measured as the number of character
+	* deletions, insertions or substitutions required to transform one string to
+	* the other string.
+	* @param target the target word or phrase
+	* @return the similarity,  0.0 or less indicates that it matches less than the required
+	* threshold and 1.0 indicates that the text and target are identical
+	*/
+	float_t similarity(const TCHAR* target, const size_t targetLen);
+
+	/**
+	* Grow the second dimension of the array, so that we can calculate the
+	* Levenshtein difference.
+	*/
+	/*
+	void growDistanceArray(int32_t m) {
+		for (int i = 0; i < d.length; i++) {
+			d[i] = new int[m+1];
+		}
+	}*/
+
+	/**
+	* The max Distance is the maximum Levenshtein distance for the text
+	* compared to some other value that results in score that is
+	* better than the minimum similarity.
+	* @param m the length of the "other value"
+	* @return the maximum levenshtein distance that we care about
+	*/
+	int32_t getMaxDistance(const size_t m);
+
+	void initializeMaxDistances();
+
+	int32_t calculateMaxDistance(const size_t m) const;
+
+protected:
+	/**
+	* The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
+	* calculate the distance between the given term and the comparing term. 
+	*/
+	bool termCompare(CL_NS(index)::Term* term) ;
+
+	/** Returns whether the current term in the enumeration has reached the end */
+	bool endEnum();
+public:
+
+	/**
+	* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
+	* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
+	* <code>minSimilarity</code>.
+	* <p>
+	* After calling the constructor the enumeration is already pointing to the first 
+	* valid term if such a term exists. 
+	* 
+	* @param reader Delivers terms.
+	* @param term Pattern term.
+	* @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
+	* @param prefixLength Length of required common prefix. Default value is 0.
+	* @throws IOException
+	*/
+	FuzzyTermEnum(CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* term, float_t minSimilarity=FuzzyQuery::defaultMinSimilarity, size_t prefixLength=0);
+	virtual ~FuzzyTermEnum();
+
+	/** Close the enumeration */
+	void close();
+
+	/** Returns the difference between the distance and the fuzzy threshold
+	*  multiplied by the scale factor
+	*/
+	float_t difference();
+
+	const char* getObjectName() const;
+	static const char* getClassName();
+};
+
 CL_NS_END
 #endif

Modified: branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h	2009-07-08 09:54:46 UTC (rev 3013)
@@ -18,7 +18,7 @@
 		PhraseQueue(const int32_t size) {
 			initialize(size,false);
 		}
-		~PhraseQueue(){
+		virtual ~PhraseQueue(){
 		}
 
 	protected:

Modified: branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h	2009-07-08 09:54:46 UTC (rev 3013)
@@ -54,11 +54,7 @@
 		}
 
 	protected:
-		PriorityQueue(){
-			this->_size = 0;
-			this->dk = false;
-			this->heap = NULL;
-            this->maxSize = 0;
+		PriorityQueue():_size(0),dk(false),maxSize(0),heap(NULL){
 		}
 
 		// Determines the ordering of objects in this priority queue.  Subclasses

Modified: branches/lucene2_3_2/src/core/files_list.txt
===================================================================
--- branches/lucene2_3_2/src/core/files_list.txt	2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/files_list.txt	2009-07-08 09:54:46 UTC (rev 3013)
@@ -39,7 +39,7 @@
 
 DONE ISH	queryParser\MultiFieldQueryParser.java	- Some tests are missing
 DONE ISH	queryParser\ParseException.java		- Done, integrated within QueryParser as functions (no special Exception class required)
-DONE ISH	queryParser\QueryParser.java		- Missing Locale and Calendar support (for RangeQuery), ConstantScoreRangeQuery, MultiPhraseQuery, and some tests. _tcstod.
+DONE ISH	queryParser\QueryParser.java		- Missing Locale and Calendar support (for RangeQuery), ConstantScoreRangeQuery, and some tests. _tcstod.
 IRRELEVANT	queryParser\QueryParser.jj
 DONE ISH	queryParser\QueryParserConstants.java
 DONE ISH	queryParser\QueryParserTokenManager.java	- PrintStream implementation is missing (if at all necessary)
@@ -64,12 +64,12 @@
 ?	search\FieldDoc.java
 ?	search\FieldDocSortedHitQueue.java
 ?	search\FieldSortedHitQueue.java
-DONE ISH	search\Filter.java						- Remove virtual toString once CachingWrapperFilter and ChainedFilter (does not exist in JL?) conform to JL
+DONE ISH	search\Filter.java				- Remove virtual toString once CachingWrapperFilter and ChainedFilter (does not exist in JL?) conform to JL
 ?	search\FilteredQuery.java
-?	search\FilteredTermEnum.java
+DONE ISH	search\FilteredTermEnum.java		- Can we mend term(void) and term(bool) ?
 ?	search\FilterManager.java
-?	search\FuzzyQuery.java
-?	search\FuzzyTermEnum.java
+DONE ISH	search\FuzzyQuery.java			- See TODOs.
+DONE ISH	search\FuzzyTermEnum.java			- See TODOs. Also, Old similarity code is commented out and marked as "legacy". It looks like some optimizations were made there, but since Fuzzy queries weren't working as they should we had to revert to Java's implementation. Perhaps after tests are complete we could try and get the optimized version to work again.
 ?	search\Hit.java
 ?	search\HitCollector.java
 ?	search\HitIterator.java
@@ -107,6 +107,7 @@
 get rid of Misc.h, repl_*
 check up on sub-folders bug (analysis/standard)
 MapViewOfFile issues (cmake not picking up functions in kernel32)
+Use safe CRT where possible. For example, make _tcsdup / stringDuplicate require n and call the _s version if cmake realizes it exists
 
 Misc TODOs:
 Update jstreams from latest code of Strigi


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.