From: <den...@us...> - 2010-01-19 15:15:52
|
Revision: 48 http://dsim.svn.sourceforge.net/dsim/?rev=48&view=rev Author: denis_arnaud Date: 2010-01-19 15:15:45 +0000 (Tue, 19 Jan 2010) Log Message: ----------- [Test] Added the original C version of the Levenshtein distance test program. Added Paths: ----------- trunk/dsim/test/boost/spirit/levenshtein.c Added: trunk/dsim/test/boost/spirit/levenshtein.c =================================================================== --- trunk/dsim/test/boost/spirit/levenshtein.c (rev 0) +++ trunk/dsim/test/boost/spirit/levenshtein.c 2010-01-19 15:15:45 UTC (rev 48) @@ -0,0 +1,168 @@ +/* + Levenshtein.c v2003-05-10 + Python extension computing Levenshtein distances, string similarities, + median strings and other goodies. + + Copyright (C) 2002-2003 David Necas (Yeti) <ye...@ph...>. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +*/ + +/* + * Levenshtein distance between string1 and string2. + * + * Replace cost is normally 1, and 2 with nonzero xcost. + */ +size_t lev_distance (size_t len1, const unsigned char* string1, + size_t len2, const unsigned char* string2, + size_t xcost) { + size_t i; + size_t *row; /* we only need to keep one row of costs */ + size_t *end; + size_t half; + + /* strip common prefix */ + while (len1 > 0 && len2 > 0 && *string1 == *string2) { + len1--; + len2--; + string1++; + string2++; + } + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { + len1--; + len2--; + } + + /* catch trivial cases */ + if (len1 == 0) + return len2; + if (len2 == 0) + return len1; + + /* make the inner cycle (i.e. string2) the longer one */ + if (len1 > len2) { + size_t nx = len1; + const unsigned char *sx = string1; + len1 = len2; + len2 = nx; + string1 = string2; + string2 = sx; + } + /* check len1 == 1 separately */ + if (len1 == 1) { + if (xcost) + return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL); + else + return len2 - (memchr(string2, *string1, len2) != NULL); + } + len1++; + len2++; + half = len1 >> 1; + + /* initalize first row */ + row = (size_t*)malloc(len2*sizeof(size_t)); + if (!row) + return (size_t)(-1); + end = row + len2 - 1; + for (i = 0; i < len2 - (xcost ? 0 : half); i++) + row[i] = i; + + /* go through the matrix and compute the costs. yes, this is an extremely + * obfuscated version, but also extremely memory-conservative and + * relatively fast. + */ + if (xcost) { + for (i = 1; i < len1; i++) { + size_t *p = row + 1; + const unsigned char char1 = string1[i - 1]; + const unsigned char *char2p = string2; + size_t D = i; + size_t x = i; + while (p <= end) { + if (char1 == *(char2p++)) + x = --D; + else + x++; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + } + } + else { + /* in this case we don't have to scan two corner triangles (of size len1/2) + * in the matrix because no best path can go throught them. note this + * breaks when len1 == len2 == 2 so the memchr() special case above is + * necessary */ + row[0] = len1 - half - 1; + for (i = 1; i < len1; i++) { + size_t *p; + const unsigned char char1 = string1[i - 1]; + const unsigned char *char2p; + size_t D, x; + /* skip the upper triangle */ + if (i >= len1 - half) { + size_t offset = i - (len1 - half); + size_t c3; + + char2p = string2 + offset; + p = row + offset; + c3 = *(p++) + (char1 != *(char2p++)); + x = *p; + x++; + D = x; + if (x > c3) + x = c3; + *(p++) = x; + } + else { + p = row + 1; + char2p = string2; + D = x = i; + } + /* skip the lower triangle */ + if (i <= half + 1) + end = row + len2 + i - half - 2; + /* main */ + while (p <= end) { + size_t c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + /* lower triangle sentinel */ + if (i <= half) { + size_t c3 = --D + (char1 != *char2p); + x++; + if (x > c3) + x = c3; + *p = x; + } + } + } + + i = *end; + free(row); + return i; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |