From: <ust...@us...> - 2009-09-27 08:58:46
|
Revision: 3022 http://clucene.svn.sourceforge.net/clucene/?rev=3022&view=rev Author: ustramooner Date: 2009-09-27 08:58:40 +0000 (Sun, 27 Sep 2009) Log Message: ----------- cleanup and fixed a bug in FuzzyTermEnum Modified Paths: -------------- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp =================================================================== --- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-09-27 08:58:03 UTC (rev 3021) +++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-09-27 08:58:40 UTC (rev 3022) @@ -28,7 +28,7 @@ FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t _prefixLength): - FilteredTermEnum(),d(NULL),dWidth(0),dHeight(0),_similarity(0),_endEnum(false),searchTerm(_CL_POINTER(term)), + FilteredTermEnum(),d(NULL),_similarity(0),_endEnum(false),searchTerm(_CL_POINTER(term)), text(NULL),textLen(0),prefix(NULL)/* ISH: was STRDUP_TtoT(LUCENE_BLANK_STRING)*/,prefixLength(_prefixLength), minimumSimilarity(minSimilarity) { @@ -58,8 +58,6 @@ prefix[realPrefixLength]='\0'; initializeMaxDistances(); - dWidth = LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; // default length of the d array - dHeight = textLen + 1; Term* trm = _CLNEW Term(searchTerm->field(), prefix); // _CLNEW Term(term, prefix); -- not intern'd? setEnum(reader->terms(trm)); @@ -134,13 +132,6 @@ const size_t targetLen = termTextLen-prefixLength; _similarity = similarity(target, targetLen); return (_similarity > minimumSimilarity); - - /* LEGACY: - //Calculate the Levenshtein distance - int32_t dist = editDistance(text, target, textLen, targetLen); - distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen)); - return (distance > minimumSimilarity); - */ } _endEnum = true; return false; @@ -177,23 +168,20 @@ //let's make sure we have enough room in our array to do the distance calculations. //Check if the array must be reallocated because it is too small or does not exist - - // TODO: realloc should be able to allocate memory for NULL pointers; if thats the case the NULL - // check here is redundant - if (d == NULL){ - dWidth = cl_max(dWidth, n+1); - dHeight = cl_max(dHeight, m+1); - d = reinterpret_cast<int32_t*>(malloc(sizeof(int32_t)*dWidth*dHeight)); - } else if (dWidth <= n || dHeight <= m) { - //growDistanceArray - dWidth = cl_max(dWidth, n+1); - dHeight = cl_max(dHeight, m+1); - d = reinterpret_cast<int32_t*>(realloc(d, sizeof(int32_t)*dWidth*dHeight)); + size_t dWidth = n+1; + size_t dHeight = m+1; + if (d == NULL){ + dLen = dWidth*dHeight; + d = (int32_t*)(malloc(sizeof(int32_t)*dLen)); + } else if (dLen < dWidth*dHeight) { + dLen = dWidth*dHeight; + d = (int32_t*)(realloc(d, sizeof(int32_t)*dLen)); } + memset(d,0,dLen); + + size_t i; // iterates through the source string + size_t j; // iterates through the target string - size_t i; // iterates through the source string - size_t j; // iterates through the target string - // init matrix d for (i = 0; i <= n; i++){ d[i + (0*dWidth)] = i; @@ -254,72 +242,6 @@ return (int32_t) ((1-minimumSimilarity) * (cl_min(textLen, m) + prefixLength)); } - /* LEGACY: - int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) { - //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity - // between two strings where the distance is measured as the number of character - // deletions, insertions or substitutions required to transform one string to - // the other string. - //Pre - s != NULL and contains the source string - // t != NULL and contains the target string - // n >= 0 and contains the length of the source string - // m >= 0 and containts the length of the target string - //Post - The distance has been returned - - CND_PRECONDITION(s != NULL, "s is NULL"); - CND_PRECONDITION(t != NULL, "t is NULL"); - CND_PRECONDITION(n >= 0," n is a negative number"); - CND_PRECONDITION(n >= 0," n is a negative number"); - - int32_t i; // iterates through s - int32_t j; // iterates through t - TCHAR s_i; // ith character of s - - if (n == 0) - return m; - if (m == 0) - return n; - - //Check if the array must be reallocated because it is too small or does not exist - if (e == NULL || eWidth <= n || eHeight <= m) { - //Delete e if possible - _CLDELETE_ARRAY(e); - //resize e - eWidth = cl_max(eWidth, n+1); - eHeight = cl_max(eHeight, m+1); - e = _CL_NEWARRAY(int32_t,eWidth*eHeight); - } - - CND_CONDITION(e != NULL,"e is NULL"); - - // init matrix e - for (i = 0; i <= n; i++){ - e[i + (0*eWidth)] = i; - } - for (j = 0; j <= m; j++){ - e[0 + (j*eWidth)] = j; - } - - int32_t __t; //temporary variable for min3 - - // start computing edit distance - for (i = 1; i <= n; i++) { - s_i = s[i - 1]; - for (j = 1; j <= m; j++) { - if (s_i != t[j-1]){ - min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]); - e[i + (j*eWidth)] = __t+1; - }else{ - min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]); - e[i + (j*eWidth)] = __t; - } - } - } - - // we got the result! - return e[n + ((m)*eWidth)]; - }*/ - class FuzzyQuery::ScoreTerm { public: Term* term; Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h =================================================================== --- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h 2009-09-27 08:58:03 UTC (rev 3021) +++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h 2009-09-27 08:58:40 UTC (rev 3022) @@ -90,8 +90,7 @@ * everytime similarity is called. */ int32_t* d; - size_t dWidth; - size_t dHeight; + size_t dLen; //float_t distance; float_t _similarity; @@ -108,24 +107,6 @@ double scale_factor; int32_t maxDistances[LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX]; - - - /* LEGACY: - int32_t* e; - int32_t eWidth; - int32_t eHeight; - ** - Levenshtein distance also known as edit distance is a measure of similiarity - between two strings where the distance is measured as the number of character - deletions, insertions or substitutions required to transform one string to - the other string. - <p>This method takes in four parameters; two strings and their respective - lengths to compute the Levenshtein distance between the two strings. - The result is returned as an integer. - * - int32_t editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m); - */ - /****************************** * Compute Levenshtein distance ******************************/ @@ -170,17 +151,6 @@ float_t similarity(const TCHAR* target, const size_t targetLen); /** - * Grow the second dimension of the array, so that we can calculate the - * Levenshtein difference. - */ - /* - void growDistanceArray(int32_t m) { - for (int i = 0; i < d.length; i++) { - d[i] = new int[m+1]; - } - }*/ - - /** * The max Distance is the maximum Levenshtein distance for the text * compared to some other value that results in score that is * better than the minimum similarity. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |