From: <fp...@us...> - 2008-11-14 17:16:24
|
Revision: 5437 http://octave.svn.sourceforge.net/octave/?rev=5437&view=rev Author: fpoto Date: 2008-11-14 17:16:21 +0000 (Fri, 14 Nov 2008) Log Message: ----------- Corrected the "complete" method, which now gives correct results. Marked the "median" method as unimplemented, as it gave wrone results. Improved the help string. Added tests for the "single" and "complete" methods. Updated license to GPL 3 or later. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-14 12:32:56 UTC (rev 5436) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-14 17:16:21 UTC (rev 5437) @@ -1,8 +1,9 @@ ## Copyright (C) 2006, 2008 Bill Denney <bi...@de...> +## Copyright (C) 2008 Francesco Potort\xEC <po...@gn...> ## ## This software is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by -## the Free Software Foundation; either version 2, or (at your option) +## the Free Software Foundation; either version 3, or (at your option) ## any later version. ## ## This software is distributed in the hope that it will be useful, but @@ -17,6 +18,7 @@ ## -*- texinfo -*- ## @deftypefn {Function File} {@var{y} =} linkage (@var{x}) ## @deftypefnx {Function File} {@var{y} =} linkage (@var{x}, @var{method}) +## ## Return clusters generated from a distance vector created by the pdist ## function. ## @@ -46,13 +48,16 @@ ## ## @end table ## -## @var{x} is an ((m-1)*(m/2) x 1) distance vector as generated by -## pdist, and the output, @var{y} is an ((m - 1) x 3) vector defined -## with columns where the first and second columns are the cluster -## numbers of the two sub-clusters in the cluster, and the third column -## is the distance between those sub-clusters. The sub-clusters are -## numbered where 1 to m are the input elements, and m+1 to the end are -## subsequently defined clusters. +## @var{x} is the dissimilarity matrix relative to @var{n} observations, +## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. +## @code{linkage} starts by putting each observation into a singleton +## cluster and numbering those from 1 to @var{n}. Then it merges two +## clusters to create a new cluster numbered @var{n+1}, and so on +## until all observations are grouped into a single cluster numbered +## @var{2*n-1}. Row @var{m} of the @math{m-1}x3 output matrix relates +## to cluster @math{n+m}: the first two columns are the numbers of the +## two component clusters and column 3 contains their distance. +## ## @seealso{cluster,pdist} ## @end deftypefn @@ -67,63 +72,55 @@ method = "single"; endif - method = lower (method); if (isempty (x)) error ("linkage: x cannot be empty"); elseif (~ isvector (x)) error ("linkage: x must be a vector"); endif - xmat = squareform (x); - sxm = size(xmat, 1); + ## Function findfxn returns a scalar from a vector and a row from a + ## matrix. - if strcmp (method, "single") - ## this is just a minimal spanning tree - y = linker (xmat, @min); - elseif strcmp (method, "complete") - y = linker (xmat, @max); - elseif strcmp (method, "average") - error ("linkage: %s is not yet implemented", method); - elseif strcmp (method, "weighted") - error ("linkage: %s is not yet implemented", method); - elseif strcmp (method, "centroid") - error ("linkage: %s is not yet implemented", method); - elseif strcmp (method, "median") - y = linker (xmat, @median); - elseif strcmp (method, "ward") - error ("linkage: %s is not yet implemented", method); - else - error ("linkage: unrecognised method"); - endif -endfunction + method = lower (method); + switch (method) + case "single" + ## this is just a minimal spanning tree + findfxn = @min; + case "complete" + findfxn = @max; + case { "average", "median", "weighted", "centroid", "ward" } + error ("linkage: %s is not yet implemented", method); + otherwise + error ("linkage: %s: unknown method", method); + endswitch -function y = linker (matrix, findfxn) - ## findfxn should return a scalar from a vector and a row from a - ## matrix - - y = zeros (0,3); - - yidx = 0; - startsize = size (matrix, 1); + dissim = squareform (x, "tomatrix"); + startsize = size (dissim, 1); + y = zeros (startsize - 1, 3); cnameidx = 1:startsize; - while (~ isscalar (matrix)) - yidx++; - sxm = size (matrix, 1); - available = (tril (ones(sxm)) - eye (sxm)); - ## find the next link to make - [r, c] = find (findfxn (matrix(logical (available))) == matrix, 1); - - ## update the results - y(yidx, :) = [cnameidx(r) cnameidx(c) matrix(r, c)]; - ## update the indexes + for yidx = 1:startsize-1 + ## Find the two nearest clusters. + available = logical(tril (ones(size(dissim))) - eye(size(dissim))); + [r, c] = find (min (dissim(available)) == dissim, 1); + ## Here is the new cluster. + y(yidx, :) = [cnameidx(r) cnameidx(c) dissim(r, c)]; + ## Add it as a new cluster index and remove the old ones. cnameidx(r) = yidx + startsize; cnameidx(c) = []; + ## Update the dissimilarity matrix; the diagonal element may be made + ## inconsistent by this, but they are not used. + newdissim = findfxn (dissim([r c], :)); + dissim(r,:) = newdissim; + dissim(:,r) = newdissim'; + dissim(c,:) = []; + dissim(:,c) = []; + endfor - ## update the matrix (the diagonal element may be made inconsistent - ## by this,but that doesn't really matter) - matrix(r,:) = findfxn (matrix([r c], :)); - matrix(:,r) = matrix(r,:)'; - matrix(c,:) = []; - matrix(:,c) = []; - endwhile -endfunction \ No newline at end of file +endfunction + +%!shared xy, t +%! xy = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; +%! t = 1e-6; +%!assert (cond (linkage (pdist (xy))), 66.534612, t); +%!assert (cond (linkage (pdist (xy), "single")), 66.534612, t); +%!assert (cond (linkage (pdist (xy), "complete")), 27.071750, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-14 18:40:38
|
Revision: 5438 http://octave.svn.sourceforge.net/octave/?rev=5438&view=rev Author: fpoto Date: 2008-11-14 18:40:34 +0000 (Fri, 14 Nov 2008) Log Message: ----------- Fixed max: zero diagonal element after cluster merge. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-14 17:16:21 UTC (rev 5437) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-14 18:40:34 UTC (rev 5438) @@ -32,16 +32,19 @@ ## Furthest distance between two clusters ## ## @item "average" -## Unweighted average distance (aka group average) +## Unweighted pair group method with averaging (UPGMA) ## ## @item "weighted" -## Weighted average distance +## Weighted pair group method with averaging (WPGMA) +## Same as Median, its formal definition does not require Euclidean +## metric. (Is this true?) ## ## @item "centroid" -## Centroid distance (not implemented) +## Centroid distance ## ## @item "median" -## Weighted center of mass distance +## Weighted pair-group method using centroids (WPGMC) +## To be used with Euclidean metric. (Is this true?) ## ## @item "ward" ## Inner squared distance (minimum variance) @@ -78,8 +81,8 @@ error ("linkage: x must be a vector"); endif - ## Function findfxn returns a scalar from a vector and a row from a - ## matrix. + ## Function findfxn must return a scalar from a vector and a row from + ## a matrix. method = lower (method); switch (method) @@ -87,8 +90,11 @@ ## this is just a minimal spanning tree findfxn = @min; case "complete" + error ("linkage: %s is not yet implemented", method); findfxn = @max; - case { "average", "median", "weighted", "centroid", "ward" } + case { "median", "weighted" } + findfxn = @mean; + case { "average", "centroid", "ward" } error ("linkage: %s is not yet implemented", method); otherwise error ("linkage: %s: unknown method", method); @@ -107,11 +113,11 @@ ## Add it as a new cluster index and remove the old ones. cnameidx(r) = yidx + startsize; cnameidx(c) = []; - ## Update the dissimilarity matrix; the diagonal element may be made - ## inconsistent by this, but they are not used. + ## Update the dissimilarity matrix newdissim = findfxn (dissim([r c], :)); dissim(r,:) = newdissim; dissim(:,r) = newdissim'; + dissim(r,r) = 0; dissim(c,:) = []; dissim(:,c) = []; endfor This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-16 19:49:13
|
Revision: 5439 http://octave.svn.sourceforge.net/octave/?rev=5439&view=rev Author: fpoto Date: 2008-11-16 19:49:09 +0000 (Sun, 16 Nov 2008) Log Message: ----------- Corrected a typo: now "complete" works. Added more tests. Performance improvements. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-14 18:40:34 UTC (rev 5438) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-16 19:49:09 UTC (rev 5439) @@ -22,6 +22,17 @@ ## Return clusters generated from a distance vector created by the pdist ## function. ## +## @var{x} is the dissimilarity matrix relative to @var{n} observations, +## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. +## @code{linkage} starts by putting each observation into a singleton +## cluster and numbering those from 1 to @var{n}. Then it merges two +## clusters, chosen according to @var{method}, to create a new cluster +## numbered @var{n+1}, and so on until all observations are grouped into +## a single cluster numbered @var{2*n-1}. Row @var{m} of the +## @math{m-1}x3 output matrix relates to cluster @math{n+m}: the first +## two columns are the numbers of the two component clusters and column +## 3 contains their distance. +## ## Methods can be: ## ## @table @samp @@ -51,16 +62,6 @@ ## ## @end table ## -## @var{x} is the dissimilarity matrix relative to @var{n} observations, -## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. -## @code{linkage} starts by putting each observation into a singleton -## cluster and numbering those from 1 to @var{n}. Then it merges two -## clusters to create a new cluster numbered @var{n+1}, and so on -## until all observations are grouped into a single cluster numbered -## @var{2*n-1}. Row @var{m} of the @math{m-1}x3 output matrix relates -## to cluster @math{n+m}: the first two columns are the numbers of the -## two component clusters and column 3 contains their distance. -## ## @seealso{cluster,pdist} ## @end deftypefn @@ -90,43 +91,46 @@ ## this is just a minimal spanning tree findfxn = @min; case "complete" - error ("linkage: %s is not yet implemented", method); findfxn = @max; - case { "median", "weighted" } - findfxn = @mean; - case { "average", "centroid", "ward" } + case { "median", "weighted", "average", "centroid", "ward" } error ("linkage: %s is not yet implemented", method); otherwise error ("linkage: %s: unknown method", method); endswitch - dissim = squareform (x, "tomatrix"); - startsize = size (dissim, 1); - y = zeros (startsize - 1, 3); - cnameidx = 1:startsize; - for yidx = 1:startsize-1 - ## Find the two nearest clusters. - available = logical(tril (ones(size(dissim))) - eye(size(dissim))); - [r, c] = find (min (dissim(available)) == dissim, 1); - ## Here is the new cluster. - y(yidx, :) = [cnameidx(r) cnameidx(c) dissim(r, c)]; - ## Add it as a new cluster index and remove the old ones. - cnameidx(r) = yidx + startsize; - cnameidx(c) = []; - ## Update the dissimilarity matrix + dissim = squareform (x, "tomatrix"); # dissimilarity matrix in square format + n = rows (dissim); # the number of observations + diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements + dissim(diagidx) = Inf; # consider a cluster as far from itself + cname = 1:n; # cluster names in dissim + y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 + for yidx = 1:n-1 + ## Find the two nearest clusters + [m midx] = min (dissim(:)); # the min distance and its first index + [r, c] = ind2sub (size (dissim), midx); # row and col number + ## Here is the new cluster + y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; + ## Add it as a new cluster index and remove the old ones + cname(r) = yidx + n; + cname(c) = []; + ## Add the new dissimilarities and remove the old ones newdissim = findfxn (dissim([r c], :)); + newdissim(r) = Inf; dissim(r,:) = newdissim; dissim(:,r) = newdissim'; - dissim(r,r) = 0; dissim(c,:) = []; dissim(:,c) = []; endfor endfunction -%!shared xy, t -%! xy = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; +%!shared x, y, t +%! x = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; +%! y = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%!assert (cond (linkage (pdist (xy))), 66.534612, t); -%!assert (cond (linkage (pdist (xy), "single")), 66.534612, t); -%!assert (cond (linkage (pdist (xy), "complete")), 27.071750, t); +%!assert (cond (linkage (pdist (x))), 66.534612, t); +%!assert (cond (linkage (pdist (y))), 34.945071, t); +%!assert (cond (linkage (pdist (x), "single")), 66.534612, t); +%!assert (cond (linkage (pdist (y), "single")), 34.945071, t); +%!assert (cond (linkage (pdist (x), "complete")), 27.071750, t); +%!assert (cond (linkage (pdist (y), "complete")), 20.296516, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-17 12:56:49
|
Revision: 5440 http://octave.svn.sourceforge.net/octave/?rev=5440&view=rev Author: fpoto Date: 2008-11-17 12:56:36 +0000 (Mon, 17 Nov 2008) Log Message: ----------- Documentation improved. Method "median" implemented. Try to mimic Matlab's clustering order (not there yet). Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-16 19:49:09 UTC (rev 5439) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-17 12:56:36 UTC (rev 5440) @@ -33,29 +33,35 @@ ## two columns are the numbers of the two component clusters and column ## 3 contains their distance. ## -## Methods can be: +## Methods define the way the distance between two clusters is computed: ## ## @table @samp ## @item "single" (default) -## Shortest distance between two clusters (aka a minimum spanning tree) +## Distance between two clusters is the minimum distance between two +## elements belonging each to one cluster. Produces a cluster tree +## known as minimum spanning tree. ## ## @item "complete" -## Furthest distance between two clusters +## Furthest distance between two elements belonging each to one cluster. ## ## @item "average" ## Unweighted pair group method with averaging (UPGMA) +## The mean distance between all pair of elements each belonging to one +## cluster. ## ## @item "weighted" ## Weighted pair group method with averaging (WPGMA) -## Same as Median, its formal definition does not require Euclidean -## metric. (Is this true?) ## ## @item "centroid" -## Centroid distance +## Unweighted Pair-Group Method using Centroids (UPGMC) +## Assumes Euclidean metric. The distance between cluster centroids, +## each centroid being the center of mass of a cluster. ## ## @item "median" -## Weighted pair-group method using centroids (WPGMC) -## To be used with Euclidean metric. (Is this true?) +## Weighted pair-group method using centroids (WPGMC). +## Assumes Euclidean metric. Distance between cluster centroids. When +## two clusters are joined together, the new centroid is the midpoint +## between the joined centroids. ## ## @item "ward" ## Inner squared distance (minimum variance) @@ -82,17 +88,18 @@ error ("linkage: x must be a vector"); endif - ## Function findfxn must return a scalar from a vector and a row from + ## Function distance must return a scalar from a vector and a row from ## a matrix. method = lower (method); switch (method) case "single" - ## this is just a minimal spanning tree - findfxn = @min; + dist = @min; case "complete" - findfxn = @max; - case { "median", "weighted", "average", "centroid", "ward" } + dist = @max; + case "median" + dist = @mediandist; # see below + case { "weighted", "average", "centroid", "ward" } error ("linkage: %s is not yet implemented", method); otherwise error ("linkage: %s: unknown method", method); @@ -106,31 +113,54 @@ y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 for yidx = 1:n-1 ## Find the two nearest clusters - [m midx] = min (dissim(:)); # the min distance and its first index - [r, c] = ind2sub (size (dissim), midx); # row and col number + ## For equal-distance nodes, the order in which nodes are added to + ## clusters is arbitrary. The following code chooses the nodes so + ## to mostly get the same ordering as in Matlab. Note that the + ## weighted methods can produce different clusterings depending on + ## the order in which elements are added to clusters. + [r c] = find (dissim == min (dissim(:)), 1, "last"); ## Here is the new cluster y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; - ## Add it as a new cluster index and remove the old ones + ## Put it in place of the first one and remove the second cname(r) = yidx + n; cname(c) = []; - ## Add the new dissimilarities and remove the old ones - newdissim = findfxn (dissim([r c], :)); - newdissim(r) = Inf; - dissim(r,:) = newdissim; - dissim(:,r) = newdissim'; + ## Same for the dissimilarities, take care of the diagonal element + d = dist (dissim([r c], :)); + d(r) = Inf; + dissim(r,:) = d; + dissim(:,r) = d'; dissim(c,:) = []; dissim(:,c) = []; endfor + ## Check that distances are monotonically increasing + if (any (diff (y(:,3)) < 0)) + warning ("clustering", + "linkage: cluster distances do not monotonically increase\n\ + you should maybe use a method different from \"%s\"", method); + endif + endfunction +## Take two row vectors, which are the distances of clusters I and J +## from the others. Column J of second row contains Inf, column J of +## first row contains distance between clusters I and J. The centroid +## of the new cluster is midway between the old ones. Use the law of +## cosines to find distances of the new ones from all the others. +function y = mediandist (x) + interdist = x(1, x(2,:) == Inf); # distance between component clusters + y = sqrt (sumsq (x) / 2 - interdist^2 / 4); +endfunction + %!shared x, y, t %! x = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; %! y = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%!assert (cond (linkage (pdist (x))), 66.534612, t); -%!assert (cond (linkage (pdist (y))), 34.945071, t); -%!assert (cond (linkage (pdist (x), "single")), 66.534612, t); -%!assert (cond (linkage (pdist (y), "single")), 34.945071, t); -%!assert (cond (linkage (pdist (x), "complete")), 27.071750, t); -%!assert (cond (linkage (pdist (y), "complete")), 20.296516, t); +%!assert (cond (linkage (pdist (x))), 56.981591, t); +%!assert (cond (linkage (pdist (y))), 27.771542, t); +%!assert (cond (linkage (pdist (x), "single")), 56.981591, t); +%!assert (cond (linkage (pdist (y), "single")), 27.771542, t); +%!assert (cond (linkage (pdist (x), "complete")), 26.681691, t); +%!assert (cond (linkage (pdist (y), "complete")), 21.726318, t); +%!assert (cond (linkage (pdist (x), "median")), 39.032841, t); +%!assert (cond (linkage (pdist (y), "median")), 26.159331, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-18 12:19:30
|
Revision: 5442 http://octave.svn.sourceforge.net/octave/?rev=5442&view=rev Author: fpoto Date: 2008-11-18 12:18:48 +0000 (Tue, 18 Nov 2008) Log Message: ----------- For those orders where different possible results are equally legitimate, try to stay nearest to Matlab (even if not completely the same). Implement Unweighted Pair-Group Method using Centroids (UPGMC), that is, the "centroid" method. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-17 17:25:48 UTC (rev 5441) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-18 12:18:48 UTC (rev 5442) @@ -48,9 +48,11 @@ ## Unweighted pair group method with averaging (UPGMA) ## The mean distance between all pair of elements each belonging to one ## cluster. +## NOT IMPLEMENTED ## ## @item "weighted" ## Weighted pair group method with averaging (WPGMA) +## NOT IMPLEMENTED ## ## @item "centroid" ## Unweighted Pair-Group Method using Centroids (UPGMC) @@ -65,6 +67,7 @@ ## ## @item "ward" ## Inner squared distance (minimum variance) +## NOT IMPLEMENTED ## ## @end table ## @@ -88,51 +91,63 @@ error ("linkage: x must be a vector"); endif - ## Function distance must return a scalar from a vector and a row from - ## a matrix. + quickmethods = { "single", "complete", "centroid", "median" }; + slowmethods = { "weighted", "average", "ward" }; method = lower (method); switch (method) - case "single" - dist = @min; - case "complete" - dist = @max; - case "median" - dist = @mediandist; # see below - case { "weighted", "average", "centroid", "ward" } + + ## These methods do not require to keep memory of merged clusters + case (quickmethods) + distfunction = {(@(x,w) min(x)); (@(x,w) max(x)) + (@(x,w) massdist(x,w)); (@(x,w) massdist(x,1))}; + dist = distfunction {strcmp (method, quickmethods)}; + dissim = squareform (x, "tomatrix"); # dissimilarity NxN matrix + n = rows (dissim); # the number of observations + diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements + dissim(diagidx) = Inf; # consider a cluster as far from itself + cname = 1:n; # cluster names in dissim + weight = ones (1, n); # cluster weights + y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 + for yidx = 1:n-1 + ## Find the two nearest clusters + ## For equal-distance nodes, the order in which nodes are added + ## to clusters is arbitrary. The two commented lines of code + ## are simplest, but the two uncommented ones choose the nodes + ## so to get an ordering closer to Matlab's. Note that the + ## weighted methods can produce different clusterings depending + ## on the order in which elements are added to clusters. + ###[m midx] = min (dissim(:)); + ###[r, c] = ind2sub (size (dissim), midx); + [r c] = find (dissim == min (dissim(:)), 1, "last"); + if (cname(r) > cname(c)) [r c] = swap (r, c); endif + ## Here is the new cluster + y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; + ## Put it in place of the first one and remove the second + cname(r) = yidx + n; + cname(c) = []; + ## Compute the new distances. The dist function must return a + ## scalar from a vector and a row from a matrix. + d = dist (dissim([r c], :), weight(r)/weight(c)); + d(r) = Inf; # take care of the diagonal element + ## Put distances in place of the first ones, remove the second ones + dissim(r,:) = d; + dissim(:,r) = d'; + dissim(c,:) = []; + dissim(:,c) = []; + ## The new weight is the sum of the components' weights + weight(r) += weight(c); + weight(c) = []; + endfor + + ## These methods require a list of elements per merged clusters + case (slowmethods) error ("linkage: %s is not yet implemented", method); + otherwise error ("linkage: %s: unknown method", method); endswitch - dissim = squareform (x, "tomatrix"); # dissimilarity matrix in square format - n = rows (dissim); # the number of observations - diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements - dissim(diagidx) = Inf; # consider a cluster as far from itself - cname = 1:n; # cluster names in dissim - y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 - for yidx = 1:n-1 - ## Find the two nearest clusters - ## For equal-distance nodes, the order in which nodes are added to - ## clusters is arbitrary. The following code chooses the nodes so - ## to mostly get the same ordering as in Matlab. Note that the - ## weighted methods can produce different clusterings depending on - ## the order in which elements are added to clusters. - [r c] = find (dissim == min (dissim(:)), 1, "last"); - ## Here is the new cluster - y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; - ## Put it in place of the first one and remove the second - cname(r) = yidx + n; - cname(c) = []; - ## Same for the dissimilarities, take care of the diagonal element - d = dist (dissim([r c], :)); - d(r) = Inf; - dissim(r,:) = d; - dissim(:,r) = d'; - dissim(c,:) = []; - dissim(:,c) = []; - endfor - ## Check that distances are monotonically increasing if (any (diff (y(:,3)) < 0)) warning ("clustering", @@ -142,25 +157,29 @@ endfunction -## Take two row vectors, which are the distances of clusters I and J -## from the others. Column J of second row contains Inf, column J of -## first row contains distance between clusters I and J. The centroid -## of the new cluster is midway between the old ones. Use the law of -## cosines to find distances of the new ones from all the others. -function y = mediandist (x) - interdist = x(1, x(2,:) == Inf); # distance between component clusters - y = sqrt (sumsq (x) / 2 - interdist^2 / 4); +## Take two row vectors, which are the Euclidean distances of clusters I +## and J from the others. Column J of second row contains Inf, column J +## of first row contains the distance between clusters I and J. The +## centroid of the new cluster is on the segment joining the old ones. W +## is the ratio between the weights of clusters I and J. Use the law of +## cosines to find the distances of the new cluster from all the others. +function y = massdist (x, w) + c = x(1, x(2,:) == Inf); # distance between component clusters + q = 1 / (1 + w); # ratio of distance position + y = sqrt ((1-q)*x(1,:).^2 + q*x(2,:).^2 + (q^2-q)*c^2); endfunction + %!shared x, y, t %! x = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; %! y = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%!assert (cond (linkage (pdist (x))), 56.981591, t); -%!assert (cond (linkage (pdist (y))), 27.771542, t); -%!assert (cond (linkage (pdist (x), "single")), 56.981591, t); -%!assert (cond (linkage (pdist (y), "single")), 27.771542, t); -%!assert (cond (linkage (pdist (x), "complete")), 26.681691, t); -%!assert (cond (linkage (pdist (y), "complete")), 21.726318, t); -%!assert (cond (linkage (pdist (x), "median")), 39.032841, t); -%!assert (cond (linkage (pdist (y), "median")), 26.159331, t); +%! disp ("linkage: should emit 4 warnings\n\t about the \"centroid\" and \"median\" methods"); +%!assert (cond (linkage (pdist (x))), 55.787151, t); +%!assert (cond (linkage (pdist (y))), 34.119045, t); +%!assert (cond (linkage (pdist (x), "complete")), 27.506710, t); +%!assert (cond (linkage (pdist (y), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "centroid")), 39.104461, t); +%!assert (cond (linkage (pdist (y), "centroid")), 27.457477, t); +%!assert (cond (linkage (pdist (x), "median")), 39.671458, t); +%!assert (cond (linkage (pdist (y), "median")), 27.683325, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-18 17:45:58
|
Revision: 5443 http://octave.svn.sourceforge.net/octave/?rev=5443&view=rev Author: fpoto Date: 2008-11-18 17:45:18 +0000 (Tue, 18 Nov 2008) Log Message: ----------- Added the WPGMA ("weighted") test. Removed Bill Denney's copyright notice, as only a couple of lines remain that originate from his code. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-18 12:18:48 UTC (rev 5442) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-18 17:45:18 UTC (rev 5443) @@ -1,4 +1,3 @@ -## Copyright (C) 2006, 2008 Bill Denney <bi...@de...> ## Copyright (C) 2008 Francesco Potort\xEC <po...@gn...> ## ## This software is free software; you can redistribute it and/or modify @@ -19,8 +18,8 @@ ## @deftypefn {Function File} {@var{y} =} linkage (@var{x}) ## @deftypefnx {Function File} {@var{y} =} linkage (@var{x}, @var{method}) ## -## Return clusters generated from a distance vector created by the pdist -## function. +## Produce a hierarchical clustering dendrogram from a distance vector +## created by the @code{pdist} function. ## ## @var{x} is the dissimilarity matrix relative to @var{n} observations, ## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. @@ -52,7 +51,8 @@ ## ## @item "weighted" ## Weighted pair group method with averaging (WPGMA) -## NOT IMPLEMENTED +## When two clusters A and B are joined together, the new distance to a +## cluster C is the mean between distances A-C and B-C. ## ## @item "centroid" ## Unweighted Pair-Group Method using Centroids (UPGMC) @@ -91,43 +91,42 @@ error ("linkage: x must be a vector"); endif - quickmethods = { "single", "complete", "centroid", "median" }; - slowmethods = { "weighted", "average", "ward" }; + quickmethods = { "single", "complete", "weighted", "centroid", "median" }; + slowmethods = { "average", "ward" }; method = lower (method); switch (method) ## These methods do not require to keep memory of merged clusters case (quickmethods) - distfunction = {(@(x,w) min(x)); (@(x,w) max(x)) - (@(x,w) massdist(x,w)); (@(x,w) massdist(x,1))}; + distfunction = {(@(x) min(x)) + (@(x) max(x)) + (@(x) mean(x)) + (@(x,w) massdist(x,w)) + (@(x) massdist(x))}; dist = distfunction {strcmp (method, quickmethods)}; dissim = squareform (x, "tomatrix"); # dissimilarity NxN matrix n = rows (dissim); # the number of observations diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements dissim(diagidx) = Inf; # consider a cluster as far from itself - cname = 1:n; # cluster names in dissim + ## For equal-distance nodes, the order in which clusters are + ## merged is arbitrary, but some methods can produce different + ## clusterings depending on it. Rotating the initial matrix + ## produces an ordering more similar to Matlab's. + dissim = rot90 (dissim, 2); + cname = n:-1:1; # cluster names in dissim weight = ones (1, n); # cluster weights y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 for yidx = 1:n-1 ## Find the two nearest clusters - ## For equal-distance nodes, the order in which nodes are added - ## to clusters is arbitrary. The two commented lines of code - ## are simplest, but the two uncommented ones choose the nodes - ## so to get an ordering closer to Matlab's. Note that the - ## weighted methods can produce different clusterings depending - ## on the order in which elements are added to clusters. - ###[m midx] = min (dissim(:)); - ###[r, c] = ind2sub (size (dissim), midx); - [r c] = find (dissim == min (dissim(:)), 1, "last"); - if (cname(r) > cname(c)) [r c] = swap (r, c); endif + [m midx] = min (dissim(:)); + [r, c] = ind2sub (size (dissim), midx); ## Here is the new cluster y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; ## Put it in place of the first one and remove the second - cname(r) = yidx + n; + cname(r) = n + yidx; cname(c) = []; - ## Compute the new distances. The dist function must return a - ## scalar from a vector and a row from a matrix. + ## Compute the new distances d = dist (dissim([r c], :), weight(r)/weight(c)); d(r) = Inf; # take care of the diagonal element ## Put distances in place of the first ones, remove the second ones @@ -139,6 +138,8 @@ weight(r) += weight(c); weight(c) = []; endfor + ## Sort the cluster numbers, as Matlab does + y(:,1:2) = sort (y(:,1:2), 2); ## These methods require a list of elements per merged clusters case (slowmethods) @@ -163,7 +164,7 @@ ## centroid of the new cluster is on the segment joining the old ones. W ## is the ratio between the weights of clusters I and J. Use the law of ## cosines to find the distances of the new cluster from all the others. -function y = massdist (x, w) +function y = massdist (x, w = 1) c = x(1, x(2,:) == Inf); # distance between component clusters q = 1 / (1 + w); # ratio of distance position y = sqrt ((1-q)*x(1,:).^2 + q*x(2,:).^2 + (q^2-q)*c^2); @@ -179,6 +180,8 @@ %!assert (cond (linkage (pdist (y))), 34.119045, t); %!assert (cond (linkage (pdist (x), "complete")), 27.506710, t); %!assert (cond (linkage (pdist (y), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "weighted")), 36.257913, t); +%!assert (cond (linkage (pdist (y), "weighted")), 27.412889, t); %!assert (cond (linkage (pdist (x), "centroid")), 39.104461, t); %!assert (cond (linkage (pdist (y), "centroid")), 27.457477, t); %!assert (cond (linkage (pdist (x), "median")), 39.671458, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-18 22:43:53
|
Revision: 5444 http://octave.svn.sourceforge.net/octave/?rev=5444&view=rev Author: fpoto Date: 2008-11-18 22:43:46 +0000 (Tue, 18 Nov 2008) Log Message: ----------- Implement "average" (UPGMA) method. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-18 17:45:18 UTC (rev 5443) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-18 22:43:46 UTC (rev 5444) @@ -47,7 +47,6 @@ ## Unweighted pair group method with averaging (UPGMA) ## The mean distance between all pair of elements each belonging to one ## cluster. -## NOT IMPLEMENTED ## ## @item "weighted" ## Weighted pair group method with averaging (WPGMA) @@ -91,20 +90,19 @@ error ("linkage: x must be a vector"); endif - quickmethods = { "single", "complete", "weighted", "centroid", "median" }; - slowmethods = { "average", "ward" }; - + methods = { "single", "complete", "average", "weighted", "centroid", "median" }; method = lower (method); switch (method) - ## These methods do not require to keep memory of merged clusters - case (quickmethods) - distfunction = {(@(x) min(x)) - (@(x) max(x)) - (@(x) mean(x)) - (@(x,w) massdist(x,w)) - (@(x) massdist(x))}; - dist = distfunction {strcmp (method, quickmethods)}; + case (methods) + distfunction = ... + {(@(x) min(x)) # single + (@(x) max(x)) # complete + (@(x,q) sum(dmult(q,x)) / sum(q)) # average + (@(x) mean(x)) # weighted + (@(x,q) massdist(x,q)) # centroid + (@(x) massdist(x))}; # median + dist = distfunction {strcmp (method, methods)}; dissim = squareform (x, "tomatrix"); # dissimilarity NxN matrix n = rows (dissim); # the number of observations diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements @@ -127,7 +125,7 @@ cname(r) = n + yidx; cname(c) = []; ## Compute the new distances - d = dist (dissim([r c], :), weight(r)/weight(c)); + d = dist (dissim([r c], :), weight([r c])); d(r) = Inf; # take care of the diagonal element ## Put distances in place of the first ones, remove the second ones dissim(r,:) = d; @@ -141,8 +139,7 @@ ## Sort the cluster numbers, as Matlab does y(:,1:2) = sort (y(:,1:2), 2); - ## These methods require a list of elements per merged clusters - case (slowmethods) + case "ward" error ("linkage: %s is not yet implemented", method); otherwise @@ -153,7 +150,7 @@ if (any (diff (y(:,3)) < 0)) warning ("clustering", "linkage: cluster distances do not monotonically increase\n\ - you should maybe use a method different from \"%s\"", method); + you should probably use a method different from \"%s\"", method); endif endfunction @@ -162,12 +159,13 @@ ## and J from the others. Column J of second row contains Inf, column J ## of first row contains the distance between clusters I and J. The ## centroid of the new cluster is on the segment joining the old ones. W -## is the ratio between the weights of clusters I and J. Use the law of -## cosines to find the distances of the new cluster from all the others. -function y = massdist (x, w = 1) +## are the weights of clusters I and J. Use the law of cosines to find +## the distances of the new cluster from all the others. +function y = massdist (x, w = [1 1]) c = x(1, x(2,:) == Inf); # distance between component clusters - q = 1 / (1 + w); # ratio of distance position - y = sqrt ((1-q)*x(1,:).^2 + q*x(2,:).^2 + (q^2-q)*c^2); + w /= sum (w); # ratio of distance position + q2 = w(2); + y = sqrt (w(1)*x(1,:).^2 + q2*(x(2,:).^2 + (q2-1)*c^2)); endfunction @@ -180,6 +178,8 @@ %!assert (cond (linkage (pdist (y))), 34.119045, t); %!assert (cond (linkage (pdist (x), "complete")), 27.506710, t); %!assert (cond (linkage (pdist (y), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "average")), 35.766804, t); +%!assert (cond (linkage (pdist (y), "average")), 27.045012, t); %!assert (cond (linkage (pdist (x), "weighted")), 36.257913, t); %!assert (cond (linkage (pdist (y), "weighted")), 27.412889, t); %!assert (cond (linkage (pdist (x), "centroid")), 39.104461, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-19 21:55:50
|
Revision: 5447 http://octave.svn.sourceforge.net/octave/?rev=5447&view=rev Author: fpoto Date: 2008-11-19 21:31:26 +0000 (Wed, 19 Nov 2008) Log Message: ----------- Implement the Ward's method. Now the linkage.m is Matlab compatible, at least as far as the methods are concerned. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-19 19:14:14 UTC (rev 5446) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-19 21:31:26 UTC (rev 5447) @@ -15,13 +15,13 @@ ## <http://www.gnu.org/licenses/>. ## -*- texinfo -*- -## @deftypefn {Function File} {@var{y} =} linkage (@var{x}) -## @deftypefnx {Function File} {@var{y} =} linkage (@var{x}, @var{method}) +## @deftypefn {Function File} {@var{y} =} linkage (@var{d}) +## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method}) ## ## Produce a hierarchical clustering dendrogram from a distance vector ## created by the @code{pdist} function. ## -## @var{x} is the dissimilarity matrix relative to @var{n} observations, +## @var{d} is the dissimilarity matrix relative to @var{n} observations, ## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. ## @code{linkage} starts by putting each observation into a singleton ## cluster and numbering those from 1 to @var{n}. Then it merges two @@ -32,7 +32,8 @@ ## two columns are the numbers of the two component clusters and column ## 3 contains their distance. ## -## Methods define the way the distance between two clusters is computed: +## @var{method} defines the way the distance between two clusters is +## computed and how they are recomputed when two clusters are merged: ## ## @table @samp ## @item "single" (default) @@ -44,17 +45,17 @@ ## Furthest distance between two elements belonging each to one cluster. ## ## @item "average" -## Unweighted pair group method with averaging (UPGMA) +## Unweighted pair group method with averaging (UPGMA). ## The mean distance between all pair of elements each belonging to one ## cluster. ## ## @item "weighted" -## Weighted pair group method with averaging (WPGMA) +## Weighted pair group method with averaging (WPGMA). ## When two clusters A and B are joined together, the new distance to a ## cluster C is the mean between distances A-C and B-C. ## ## @item "centroid" -## Unweighted Pair-Group Method using Centroids (UPGMC) +## Unweighted Pair-Group Method using Centroids (UPGMC). ## Assumes Euclidean metric. The distance between cluster centroids, ## each centroid being the center of mass of a cluster. ## @@ -65,17 +66,23 @@ ## between the joined centroids. ## ## @item "ward" -## Inner squared distance (minimum variance) -## NOT IMPLEMENTED -## +## Ward's sum of squared deviations about the group mean (ESS). +## Also known as minimum variance or inner squared distance. +## Assumes Euclidean metric. How much the moment of inertia of the +## merged cluster exceeds the sum of those of the individual clusters. ## @end table ## -## @seealso{cluster,pdist} +## @strong{Reference} +## Ward, J. H. Hierarchical Grouping to Optimize an Objective Function +## J. Am. Statist. Assoc. 1963, 58, 236-244, +## @url{http://iv.slis.indiana.edu/sw/data/ward.pdf}. ## @end deftypefn +## +## @seealso{pdist,squareform} -## Author: Bill Denney <denney@...> +## Author: Francesco Potort\xEC <po...@gn...> -function y = linkage (x, method) +function dgram = linkage (d, method) ## check the input if (nargin < 1) || (nargin > 2) @@ -84,70 +91,67 @@ method = "single"; endif - if (isempty (x)) - error ("linkage: x cannot be empty"); - elseif (~ isvector (x)) - error ("linkage: x must be a vector"); + if (isempty (d)) + error ("linkage: d cannot be empty"); + elseif (~ isvector (d)) + error ("linkage: d must be a vector"); endif - methods = { "single", "complete", "average", "weighted", "centroid", "median" }; - method = lower (method); - switch (method) + methods = struct ... + ("name", { "single"; "complete"; "average"; "weighted"; + "centroid"; "median"; "ward" }, + "distfunc", {(@(x) min(x)) # single + (@(x) max(x)) # complete + (@(x,i,j,w) sum(dmult(q=w([i,j]),x))/sum(q)) # average + (@(x) mean(x)) # weighted + (@massdist) # centroid + (@(x,i) massdist(x,i)) # median + (@inertialdist) # ward + }); - case (methods) - distfunction = ... - {(@(x) min(x)) # single - (@(x) max(x)) # complete - (@(x,q) sum(dmult(q,x)) / sum(q)) # average - (@(x) mean(x)) # weighted - (@(x,q) massdist(x,q)) # centroid - (@(x) massdist(x))}; # median - dist = distfunction {strcmp (method, methods)}; - dissim = squareform (x, "tomatrix"); # dissimilarity NxN matrix - n = rows (dissim); # the number of observations - diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements - dissim(diagidx) = Inf; # consider a cluster as far from itself - ## For equal-distance nodes, the order in which clusters are - ## merged is arbitrary, but some methods can produce different - ## clusterings depending on it. Rotating the initial matrix - ## produces an ordering more similar to Matlab's. - dissim = rot90 (dissim, 2); - cname = n:-1:1; # cluster names in dissim - weight = ones (1, n); # cluster weights - y = zeros (n-1, 3); # clusters from n+1 to 2*n-1 - for yidx = 1:n-1 - ## Find the two nearest clusters - [m midx] = min (dissim(:)); - [r, c] = ind2sub (size (dissim), midx); - ## Here is the new cluster - y(yidx, :) = [cname(r) cname(c) dissim(r, c)]; - ## Put it in place of the first one and remove the second - cname(r) = n + yidx; - cname(c) = []; - ## Compute the new distances - d = dist (dissim([r c], :), weight([r c])); - d(r) = Inf; # take care of the diagonal element - ## Put distances in place of the first ones, remove the second ones - dissim(r,:) = d; - dissim(:,r) = d'; - dissim(c,:) = []; - dissim(:,c) = []; - ## The new weight is the sum of the components' weights - weight(r) += weight(c); - weight(c) = []; - endfor - ## Sort the cluster numbers, as Matlab does - y(:,1:2) = sort (y(:,1:2), 2); + mask = strcmp (lower (method), {methods.name}); + if (! any (mask)) + error ("linkage: %s: unknown method", method); + endif + dist = {methods.distfunc}{mask}; - case "ward" - error ("linkage: %s is not yet implemented", method); + d = squareform (d, "tomatrix"); # dissimilarity NxN matrix + n = rows (d); # the number of observations + diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements + d(diagidx) = Inf; # consider a cluster as far from itself + ## For equal-distance nodes, the order in which clusters are + ## merged is arbitrary. Rotating the initial matrix produces an + ## ordering similar to Matlab's. + cname = n:-1:1; # cluster names in d + d = rot90 (d, 2); # exchange low and high cluster numbers + weight = ones (1, n); # cluster weights + dgram = zeros (n-1, 3); # clusters from n+1 to 2*n-1 + for cluster = n+1:2*n-1 + ## Find the two nearest clusters + [m midx] = min (d(:)); + [r, c] = ind2sub (size (d), midx); + ## Here is the new cluster + dgram(cluster-n, :) = [cname(r) cname(c) d(r, c)]; + ## Put it in place of the first one and remove the second + cname(r) = cluster; + cname(c) = []; + ## Compute the new distances + newd = dist (d([r c], :), r, c, weight); + newd(r) = Inf; # take care of the diagonal element + ## Put distances in place of the first ones, remove the second ones + d(r,:) = newd; + d(:,r) = newd'; + d(c,:) = []; + d(:,c) = []; + ## The new weight is the sum of the components' weights + weight(r) += weight(c); + weight(c) = []; + endfor + ## Sort the cluster numbers, as Matlab does + dgram(:,1:2) = sort (dgram(:,1:2), 2); - otherwise - error ("linkage: %s: unknown method", method); - endswitch - ## Check that distances are monotonically increasing - if (any (diff (y(:,3)) < 0)) + if (any (diff (dgram(:,3)) < 0)) warning ("clustering", "linkage: cluster distances do not monotonically increase\n\ you should probably use a method different from \"%s\"", method); @@ -156,33 +160,57 @@ endfunction ## Take two row vectors, which are the Euclidean distances of clusters I -## and J from the others. Column J of second row contains Inf, column J -## of first row contains the distance between clusters I and J. The -## centroid of the new cluster is on the segment joining the old ones. W -## are the weights of clusters I and J. Use the law of cosines to find -## the distances of the new cluster from all the others. -function y = massdist (x, w = [1 1]) - c = x(1, x(2,:) == Inf); # distance between component clusters - w /= sum (w); # ratio of distance position - q2 = w(2); - y = sqrt (w(1)*x(1,:).^2 + q2*(x(2,:).^2 + (q2-1)*c^2)); +## and J from the others. Column I of second row contains the distance +## between clusters I and J. The centre of gravity of the new cluster +## is on the segment joining the old ones. W are the weights of all +## clusters. Use the law of cosines to find the distances of the new +## cluster from all the others. +function y = massdist (x, i, j, w) + c = x(2, i); # distance between I and J + if (nargin < 4) # median distance + qi = qj = 0.5; # equal weights ("weighted") + else # centroid distance + qi = w(i); qj = w(j); # the cluster weights + norm = qi + qj; # normalisation factor + qi /= norm; qj /= norm; # normalise them ("unweighted") + endif + y = sqrt (qi*x(1,:).^2 + qj*(x(2,:).^2 - (1-qj)*c^2)); endfunction +## Take two row vectors, which are the inertial distances of clusters I +## and J from the others. Column I of second row contains the inertial +## distance between clusters I and J. The centre of gravity of the new +## cluster K is on the segment joining I and J. W are the weights of +## all clusters. Use the law of cosines to find the distance of K from +## all the other clusters, then compute the intertial distance of K from +## all the other clusters and return it. +function y = inertialdist (x, i, j, w) + wi = w(i); wj = w(j); # the cluster weights + sij = wi + wj; # sum of weights of I and J + c2 = x(2,i)^2 * sij / wi / wj; # squared Eucl. dist. between I and J + s = [wi + w; wj + w]; # sum of weights for all cluster pairs + p = [wi * w; wj * w]; # product of weights for all cluster pairs + ed2 = x.^2 .* s ./ p; # convert inertial dist. to squared Eucl. + qi = wi/sij; qj = wj/sij; # normalise the weights of I and J + ## Squared Euclidean distances between all clusters and new cluster K + ed2 = qi*ed2(1,:) + qj*(ed2(2,:) - (1-qj)*c2); + y = sqrt (ed2 * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial +endfunction -%!shared x, y, t -%! x = [3 1.7; 1 1; 2 3; 2 2.5; 1.2 1; 1.1 1.5; 3 1]; -%! y = reshape(mod(magic(6),5),[],3); + +%!shared x, t +%! x = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%! disp ("linkage: should emit 4 warnings\n\t about the \"centroid\" and \"median\" methods"); -%!assert (cond (linkage (pdist (x))), 55.787151, t); -%!assert (cond (linkage (pdist (y))), 34.119045, t); -%!assert (cond (linkage (pdist (x), "complete")), 27.506710, t); -%!assert (cond (linkage (pdist (y), "complete")), 21.793345, t); -%!assert (cond (linkage (pdist (x), "average")), 35.766804, t); -%!assert (cond (linkage (pdist (y), "average")), 27.045012, t); -%!assert (cond (linkage (pdist (x), "weighted")), 36.257913, t); -%!assert (cond (linkage (pdist (y), "weighted")), 27.412889, t); -%!assert (cond (linkage (pdist (x), "centroid")), 39.104461, t); -%!assert (cond (linkage (pdist (y), "centroid")), 27.457477, t); -%!assert (cond (linkage (pdist (x), "median")), 39.671458, t); -%!assert (cond (linkage (pdist (y), "median")), 27.683325, t); +%!assert (cond (linkage (pdist (x))), 34.119045, t); +%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "average")), 27.045012, t); +%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t); +%!test warning off clustering +%! assert (cond (linkage (pdist (x), "centroid")),27.457477, t); +%! warning on clustering +%!warning <monotonically> linkage (pdist (x), "centroid"); +%!test warning off clustering +%! assert (cond (linkage (pdist (x), "median")), 27.683325, t); +%! warning on clustering +%!warning <monotonically> linkage (pdist (x), "median"); +%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-20 19:02:14
|
Revision: 5448 http://octave.svn.sourceforge.net/octave/?rev=5448&view=rev Author: fpoto Date: 2008-11-20 19:02:07 +0000 (Thu, 20 Nov 2008) Log Message: ----------- Implement the remaining calling methods. Slight optimisation of centroid and median cases. Comments clarified. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-19 21:31:26 UTC (rev 5447) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-20 19:02:07 UTC (rev 5448) @@ -17,12 +17,20 @@ ## -*- texinfo -*- ## @deftypefn {Function File} {@var{y} =} linkage (@var{d}) ## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method}) +## @deftypefnx {Function File} @ +## {@var{y} =} linkage (@var{x}, @var{method}, @var{metric}) +## @deftypefnx {Function File} @ +## {@var{y} =} linkage (@var{x}, @var{method}, @var{arglist}) ## -## Produce a hierarchical clustering dendrogram from a distance vector -## created by the @code{pdist} function. +## Produce a hierarchical clustering dendrogram ## ## @var{d} is the dissimilarity matrix relative to @var{n} observations, ## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. +## Alternatively, @var{x} contains data formatted for input to +## @code{pdist}, @var{metric} is a metric for @code{pdist} and +## @var{arglist} is a cell array containing arguments that are passed to +## @code{pdist}. +## ## @code{linkage} starts by putting each observation into a singleton ## cluster and numbering those from 1 to @var{n}. Then it merges two ## clusters, chosen according to @var{method}, to create a new cluster @@ -82,18 +90,16 @@ ## Author: Francesco Potort\xEC <po...@gn...> -function dgram = linkage (d, method) +function dgram = linkage (d, method = "single", distarg) ## check the input - if (nargin < 1) || (nargin > 2) + if (nargin < 1) || (nargin > 3) print_usage (); - elseif (nargin < 2) - method = "single"; endif if (isempty (d)) error ("linkage: d cannot be empty"); - elseif (~ isvector (d)) + elseif ( nargin < 3 && ~ isvector (d)) error ("linkage: d must be a vector"); endif @@ -108,13 +114,22 @@ (@(x,i) massdist(x,i)) # median (@inertialdist) # ward }); - mask = strcmp (lower (method), {methods.name}); if (! any (mask)) error ("linkage: %s: unknown method", method); endif dist = {methods.distfunc}{mask}; + if (nargin == 3) + if (ischar (distarg)) + d = pdist (d, distarg); + elseif (iscell (distarg)) + d = pdist (d, distarg{:}); + else + print_usage (); + endif + endif + d = squareform (d, "tomatrix"); # dissimilarity NxN matrix n = rows (d); # the number of observations diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements @@ -166,24 +181,24 @@ ## clusters. Use the law of cosines to find the distances of the new ## cluster from all the others. function y = massdist (x, i, j, w) - c = x(2, i); # distance between I and J + x .^= 2; # squared Euclidean distances + c2 = x(2, i); # squared distance between I and J if (nargin < 4) # median distance - qi = qj = 0.5; # equal weights ("weighted") + qi = 0.5; # equal weights ("weighted") else # centroid distance - qi = w(i); qj = w(j); # the cluster weights - norm = qi + qj; # normalisation factor - qi /= norm; qj /= norm; # normalise them ("unweighted") + qi = 1 / (1 + w(j) / w(i)); # the cluster weights endif - y = sqrt (qi*x(1,:).^2 + qj*(x(2,:).^2 - (1-qj)*c^2)); + y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2)); endfunction ## Take two row vectors, which are the inertial distances of clusters I ## and J from the others. Column I of second row contains the inertial ## distance between clusters I and J. The centre of gravity of the new ## cluster K is on the segment joining I and J. W are the weights of -## all clusters. Use the law of cosines to find the distance of K from -## all the other clusters, then compute the intertial distance of K from -## all the other clusters and return it. +## all clusters. Convert inertial to Euclidean distances, then use the +## law of cosines to find the Euclidean distances of K from all the +## other clusters, convert them back to inertial distances and return +## them. function y = inertialdist (x, i, j, w) wi = w(i); wj = w(j); # the cluster weights sij = wi + wj; # sum of weights of I and J @@ -201,16 +216,19 @@ %!shared x, t %! x = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%!assert (cond (linkage (pdist (x))), 34.119045, t); -%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t); -%!assert (cond (linkage (pdist (x), "average")), 27.045012, t); -%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t); +%!assert (cond (linkage (pdist (x))), 34.119045, t); +%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "average")), 27.045012, t); +%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t); +%!warning <monotonically> linkage (pdist (x), "centroid"); %!test warning off clustering -%! assert (cond (linkage (pdist (x), "centroid")),27.457477, t); +%! assert (cond (linkage (pdist (x), "centroid")), 27.457477, t); %! warning on clustering -%!warning <monotonically> linkage (pdist (x), "centroid"); +%!warning <monotonically> linkage (pdist (x), "median"); %!test warning off clustering -%! assert (cond (linkage (pdist (x), "median")), 27.683325, t); +%! assert (cond (linkage (pdist (x), "median")), 27.683325, t); %! warning on clustering -%!warning <monotonically> linkage (pdist (x), "median"); -%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t); +%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t); +%!assert (cond (linkage(x,"ward","euclidean")), 17.195198, t); +%!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198, t); +%!assert (cond (linkage(x,"ward",{"minkowski", 2})),17.195198, t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-20 21:29:29
|
Revision: 5449 http://octave.svn.sourceforge.net/octave/?rev=5449&view=rev Author: fpoto Date: 2008-11-20 21:29:21 +0000 (Thu, 20 Nov 2008) Log Message: ----------- Slight optimisation of inertialdist(). Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-20 19:02:07 UTC (rev 5448) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-20 21:29:21 UTC (rev 5449) @@ -200,16 +200,17 @@ ## other clusters, convert them back to inertial distances and return ## them. function y = inertialdist (x, i, j, w) + x .^= 2; # squared inertial distances wi = w(i); wj = w(j); # the cluster weights sij = wi + wj; # sum of weights of I and J - c2 = x(2,i)^2 * sij / wi / wj; # squared Eucl. dist. between I and J + c2 = x(2,i) * sij / wi / wj; # squared Eucl. dist. between I and J s = [wi + w; wj + w]; # sum of weights for all cluster pairs - p = [wi * w; wj * w]; # product of weights for all cluster pairs - ed2 = x.^2 .* s ./ p; # convert inertial dist. to squared Eucl. - qi = wi/sij; qj = wj/sij; # normalise the weights of I and J + p = [wi * w; wj * w]; # product of weights for all cluster pairs + x .*= s ./ p; # convert inertial dist. to squared Eucl. + qi = wi/sij; # normalise the weights of I and J ## Squared Euclidean distances between all clusters and new cluster K - ed2 = qi*ed2(1,:) + qj*(ed2(2,:) - (1-qj)*c2); - y = sqrt (ed2 * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial + x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2); + y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial endfunction This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <fp...@us...> - 2008-11-21 07:54:25
|
Revision: 5450 http://octave.svn.sourceforge.net/octave/?rev=5450&view=rev Author: fpoto Date: 2008-11-21 07:54:22 +0000 (Fri, 21 Nov 2008) Log Message: ----------- Some code simplifications. Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-20 21:29:21 UTC (rev 5449) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2008-11-21 07:54:22 UTC (rev 5450) @@ -182,13 +182,12 @@ ## cluster from all the others. function y = massdist (x, i, j, w) x .^= 2; # squared Euclidean distances - c2 = x(2, i); # squared distance between I and J - if (nargin < 4) # median distance + if (nargin == 2) # median distance qi = 0.5; # equal weights ("weighted") else # centroid distance - qi = 1 / (1 + w(j) / w(i)); # the cluster weights + qi = 1 / (1 + w(j) / w(i)); # proportional weights ("unweighted") endif - y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2)); + y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i))); endfunction ## Take two row vectors, which are the inertial distances of clusters I @@ -200,16 +199,14 @@ ## other clusters, convert them back to inertial distances and return ## them. function y = inertialdist (x, i, j, w) - x .^= 2; # squared inertial distances - wi = w(i); wj = w(j); # the cluster weights - sij = wi + wj; # sum of weights of I and J - c2 = x(2,i) * sij / wi / wj; # squared Eucl. dist. between I and J - s = [wi + w; wj + w]; # sum of weights for all cluster pairs - p = [wi * w; wj * w]; # product of weights for all cluster pairs - x .*= s ./ p; # convert inertial dist. to squared Eucl. - qi = wi/sij; # normalise the weights of I and J + wi = w(i); wj = w(j); # the cluster weights + s = [wi + w; wj + w]; # sum of weights for all cluster pairs + p = [wi * w; wj * w]; # product of weights for all cluster pairs + x = x.^2 .* s ./ p; # convert inertial dist. to squared Eucl. + sij = wi + wj; # sum of weights of I and J + qi = wi/sij; # normalise the weight of I ## Squared Euclidean distances between all clusters and new cluster K - x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2); + x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i)); y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial endfunction @@ -217,19 +214,19 @@ %!shared x, t %! x = reshape(mod(magic(6),5),[],3); %! t = 1e-6; -%!assert (cond (linkage (pdist (x))), 34.119045, t); -%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t); -%!assert (cond (linkage (pdist (x), "average")), 27.045012, t); -%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t); +%!assert (cond (linkage (pdist (x))), 34.119045,t); +%!assert (cond (linkage (pdist (x), "complete")), 21.793345,t); +%!assert (cond (linkage (pdist (x), "average")), 27.045012,t); +%!assert (cond (linkage (pdist (x), "weighted")), 27.412889,t); %!warning <monotonically> linkage (pdist (x), "centroid"); %!test warning off clustering -%! assert (cond (linkage (pdist (x), "centroid")), 27.457477, t); +%! assert (cond (linkage (pdist (x), "centroid")), 27.457477,t); %! warning on clustering %!warning <monotonically> linkage (pdist (x), "median"); %!test warning off clustering -%! assert (cond (linkage (pdist (x), "median")), 27.683325, t); +%! assert (cond (linkage (pdist (x), "median")), 27.683325,t); %! warning on clustering -%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t); -%!assert (cond (linkage(x,"ward","euclidean")), 17.195198, t); -%!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198, t); -%!assert (cond (linkage(x,"ward",{"minkowski", 2})),17.195198, t); +%!assert (cond (linkage (pdist (x), "ward")), 17.195198,t); +%!assert (cond (linkage(x,"ward","euclidean")), 17.195198,t); +%!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198,t); +%!assert (cond (linkage(x,"ward",{"minkowski",2})),17.195198,t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <ha...@us...> - 2010-01-09 21:01:14
|
Revision: 6725 http://octave.svn.sourceforge.net/octave/?rev=6725&view=rev Author: hauberg Date: 2010-01-09 21:01:06 +0000 (Sat, 09 Jan 2010) Log Message: ----------- Replace 'dmult' usage with more modern approach Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2010-01-09 16:14:15 UTC (rev 6724) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2010-01-09 21:01:06 UTC (rev 6725) @@ -108,7 +108,7 @@ "centroid"; "median"; "ward" }, "distfunc", {(@(x) min(x)) # single (@(x) max(x)) # complete - (@(x,i,j,w) sum(dmult(q=w([i,j]),x))/sum(q)) # average + (@(x,i,j,w) sum(diag(q=w([i,j]))*x)/sum(q)) # average (@(x) mean(x)) # weighted (@massdist) # centroid (@(x,i) massdist(x,i)) # median This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <car...@us...> - 2012-03-14 22:54:45
|
Revision: 9891 http://octave.svn.sourceforge.net/octave/?rev=9891&view=rev Author: carandraug Date: 2012-03-14 22:54:39 +0000 (Wed, 14 Mar 2012) Log Message: ----------- linkage: clear previous warnings before testing warnings on suite (may be a bug in octave) Modified Paths: -------------- trunk/octave-forge/main/statistics/inst/linkage.m Modified: trunk/octave-forge/main/statistics/inst/linkage.m =================================================================== --- trunk/octave-forge/main/statistics/inst/linkage.m 2012-03-14 22:48:24 UTC (rev 9890) +++ trunk/octave-forge/main/statistics/inst/linkage.m 2012-03-14 22:54:39 UTC (rev 9891) @@ -217,6 +217,7 @@ %!assert (cond (linkage (pdist (x), "complete")), 21.793345,t); %!assert (cond (linkage (pdist (x), "average")), 27.045012,t); %!assert (cond (linkage (pdist (x), "weighted")), 27.412889,t); +%! lastwarn(); # Clear last warning before the test %!warning <monotonically> linkage (pdist (x), "centroid"); %!test warning off clustering %! assert (cond (linkage (pdist (x), "centroid")), 27.457477,t); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |