From: Michael C. <mc...@us...> - 2007-01-24 10:55:57
|
Update of /cvsroot/octave/octave-forge/main/econometrics/inst In directory sc8-pr-cvs3.sourceforge.net:/tmp/cvs-serv23833/main/econometrics/inst Added Files: kernel_density_cvscore.m kernel_density.m kernel_density_nodes.m __kernel_epanechnikov.m kernel_example.m __kernel_normal.m kernel_optimal_bandwidth.m kernel_regression_cvscore.m kernel_regression.m kernel_regression_nodes.m Log Message: initial commit of functions for nonparametric smoothing. Includes kernel regression and density. Can make use of MPITB for parallel computing if it's available. --- NEW FILE: kernel_density.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # kernel_density: multivariate kernel density estimator # # usage: # dens = kernel_density(eval_points, data, bandwidth) # # inputs: # eval_points: PxK matrix of points at which to calculate the density # data: NxK matrix of data points # bandwidth: positive scalar, the smoothing parameter. The fit # is more smooth as the bandwidth increases. # do_cv: bool (optional). default false. If true, calculate leave-1-out # density for cross validation # nslaves: int (optional, default 0). Number of compute nodes for parallel evaluation # debug: bool (optional, default false). show results on compute nodes if doing # parallel run # bandwith_matrix (optional): nonsingular KxK matrix. Rotates data. # Default is Choleski decomposition of inverse of covariance, # to approximate independence after the transformation, which # makes a product kernel a reasonable choice. # kernel (optional): string. Name of the kernel function. Default is radial # symmetric Epanechnikov kernel. # outputs: # dens: Px1 vector: the fitted density value at each of the P evaluation points. # # References: # Wand, M.P. and Jones, M.C. (1995), 'Kernel smoothing'. # http://www.xplore-stat.de/ebooks/scripts/spm/html/spmhtmlframe73.html function z = kernel_density(eval_points, data, bandwidth, do_cv, nslaves, debug, bandwith_matrix, kernel) if nargin < 3; error("kernel_density: at least 3 arguments are required"); endif # set defaults for optional args # default ordinary density, not leave-1-out if (nargin < 4) do_cv = false; endif # default serial if (nargin < 5) nslaves = 0; endif # debug or not (default) if (nargin < 6) debug = false; endif; # default bandwidth matrix (up to factor of proportionality) if (nargin < 7) bandwidth_matrix = chol(cov(data)); endif # default bandwidth matrix # default kernel if (nargin < 8) kernel = "__kernel_epanechnikov"; endif # default kernel nn = rows(eval_points); n = rows(data); # Inverse bandwidth matrix H_inv H = bandwidth_matrix*bandwidth; H_inv = inv(H); # weight by inverse bandwidth matrix eval_points = eval_points*H_inv; data = data*H_inv; # check if doing this parallel or serial global PARALLEL NSLAVES NEWORLD NSLAVES TAG PARALLEL = 0; if nslaves > 0 PARALLEL = 1; NSLAVES = nslaves; LAM_Init(nslaves, debug); endif if !PARALLEL # ordinary serial version points_per_node = nn; # do the all on this node z = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); else # parallel version z = zeros(nn,1); points_per_node = floor(nn/(NSLAVES + 1)); # number of obsns per slave # The command that the slave nodes will execute cmd=['z_on_node = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); ',... 'MPI_Send(z_on_node, 0, TAG, NEWORLD);']; # send items to slaves NumCmds_Send({"eval_points", "data", "do_cv", "kernel", "points_per_node", "nslaves", "debug","cmd"}, {eval_points, data, do_cv, kernel, points_per_node, nslaves, debug, cmd}); # evaluate last block on master while slaves are busy z_on_node = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); startblock = NSLAVES*points_per_node + 1; endblock = nn; z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node; # collect slaves' results z_on_node = zeros(points_per_node,1); # size may differ between master and compute nodes - reset here for i = 1:NSLAVES MPI_Recv(z_on_node,i,TAG,NEWORLD); startblock = i*points_per_node - points_per_node + 1; endblock = i*points_per_node; z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node; endfor # clean up after parallel LAM_Finalize; endif z = z*det(H_inv); endfunction --- NEW FILE: kernel_optimal_bandwidth.m --- function bandwidth = kernel_optimal_bandwidth(data, depvar) # SA controls ub = 1; lb = -5; nt = 1; ns = 1; rt = 0.05; maxevals = 50; neps = 5; functol = 1e-2; paramtol = 1e-3; verbosity = 0; minarg = 1; sa_control = { lb, ub, nt, ns, rt, maxevals, neps, functol, paramtol, verbosity, 1}; # bfgs controls bfgs_control = {100,0,0}; if (nargin == 1) bandwidth = samin("kernel_density_cvscore", {0, data}, sa_control); bandwidth = bfgsmin("kernel_density_cvscore", {bandwidth, data}, bfgs_control); else bandwidth = samin("kernel_regression_cvscore", {0, data, depvar}, sa_control); bandwidth = bfgsmin("kernel_regression_cvscore", {bandwidth, data, depvar}, bfgs_control); endif bandwidth = exp(bandwidth); endfunction --- NEW FILE: kernel_example.m --- # Copyright (C) 2007 Michael Creel <mic...@ua...> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # kernel_example: examples of how to use kernel density and regression functions # # usage: kernel_example; # sample size - should get better fit (on average) as this increases n = 500; # set this to greater than 0 to try parallel computations (requires MPITB) compute_nodes = 0; nodes = compute_nodes + 1; # count master node ############################################################ # kernel regression example # uniformly spaced data points on [0,2] x = 1:n; x = x'; x = 2*x/n; # generate dependent variable trueline = x + (x.^2)/2 - 3.1*(x.^3)/3 + 1.2*(x.^4)/4; sig = 0.5; y = trueline + sig*randn(n,1); # default bandwidth bandwidth = 0.45; # get optimal bandwidth (time consuming, uncomment if you want to try it) # bandwidth = kernel_optimal_bandwidth(x, y); # get the fit and do the plot tic; fit = kernel_regression(x, y, x, bandwidth, false, compute_nodes); t1 = toc; printf("\n"); printf("########################################################################\n"); printf("Kernel regression example\n"); printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1); grid("on"); title("Example 1: Kernel regression fit"); plot(x, fit, x, trueline); ############################################################ # kernel density example: univariate - fit to Chi^2(3) data data = sumsq(randn(n,3),2); # evaluation point are on a grid for plotting stepsize = 0.2; grid_x = (-1:stepsize:11)'; bandwidth = 0.55; # get optimal bandwidth (time consuming, uncomment if you want to try it) # bandwidth = kernel_optimal_bandwidth(data); # get the fitted density and do a plot tic; dens = kernel_density(grid_x, data, bandwidth, false, compute_nodes); t1 = toc; printf("\n"); printf("########################################################################\n"); printf("Univariate kernel density example\n"); printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1); printf("A rough integration under the fitted univariate density is %f\n", sum(dens)*stepsize); figure(); title("Example 2: Kernel density fit: Univariate Chi^2(3) data"); xlabel("true density is Chi^2(3)"); plot(grid_x, [dens chisquare_pdf(grid_x,3)]); ############################################################ # kernel density example: bivariate # X ~ N(0,1) # Y ~ Chi squared(3) # X, Y are dependent d = randn(n,3); data = [d(:,1) sumsq(d,2)]; # evaluation points are on a grid for plotting stepsize = 0.2; a = (-5:stepsize:5)'; # for the N(0,1) b = (-1:stepsize:9)'; # for the Chi squared(3) gridsize = rows(a); [grid_x, grid_y] = meshgrid(a, b); eval_points = [vec(grid_x) vec(grid_y)]; bandwidth = 0.85; # get optimal bandwidth (time consuming, uncomment if you want to try it) # bandwidth = kernel_optimal_bandwidth(data); # get the fitted density and do a plot tic; dens = kernel_density(eval_points, data, bandwidth, false, compute_nodes); t1 = toc; printf("\n"); printf("########################################################################\n"); printf("Multivatiate kernel density example\n"); printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1); dens = reshape(dens, gridsize, gridsize); printf("A rough integration under the fitted bivariate density is %f\n", sum(sum(dens))*stepsize^2); figure(); legend("off"); title("Example 3: Kernel density fit: dependent bivatiate data"); xlabel("true marginal density is N(0,1)"); ylabel("true marginal density is Chi^2(3)"); surf(grid_x, grid_y, dens); # more extensive test of parallel if compute_nodes > 0 # only try this if parallel is available ############################################################ # kernel density example: bivariate # X ~ N(0,1) # Y ~ Chi squared(3) # X, Y are dependent # evaluation points are on a grid for plotting stepsize = 0.2; a = (-5:stepsize:5)'; # for the N(0,1) b = (-1:stepsize:9)'; # for the Chi squared(3) gridsize = rows(a); [grid_x, grid_y] = meshgrid(a, b); eval_points = [vec(grid_x) vec(grid_y)]; bandwidth = 0.85; ns = [1000; 2000; 4000; 8000]; printf("\n"); printf("########################################################################\n"); printf("multivariate kernel density example with several sample sizes serial/parallel timings\n"); for i = 1:rows(ns) for compute_nodes = 0:1 nodes = compute_nodes + 1; n = ns(i,:); d = randn(n,3); data = [d(:,1) sumsq(d,2)]; tic; dens = kernel_density(eval_points, data, bandwidth, false, compute_nodes); t1 = toc; printf(" %d data points and %d compute nodes: %f\n", n, nodes, t1); endfor endfor endif --- NEW FILE: kernel_density_cvscore.m --- function cvscore = kernel_density_cvscore(bandwidth, data, depvar) dens = kernel_density(data, data, exp(bandwidth), true); dens = dens + eps; # some kernels can assign zero density cvscore = -mean(log(dens)); endfunction --- NEW FILE: kernel_regression_nodes.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # under the terms of the GNU General Public License. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # Kernel regression, parallel version: do calculations on nodes function z = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug) if (nslaves > 0) global NEWORLD [info, myrank] = MPI_Comm_rank(NEWORLD); else myrank = 0; # if not parallel then do all on master node endif if myrank == 0 # Do this if I'm master startblock = nslaves*points_per_node + 1; endblock = rows(eval_points); else # this is for the slaves startblock = myrank*points_per_node - points_per_node + 1; endblock = myrank*points_per_node; endif # the block of eval_points this node does myeval = eval_points(startblock:endblock,:); nn = rows(myeval); n = rows(data); y = data(:,1); data = data(:,2:columns(data)); # calculate distances # note to self: loop is as fast as double kronecker with # reshape. Also, simpler and safe in terms of memory. z = zeros(nn,1); for i = 1:nn zz = data - kron(myeval(i,:), ones(n,1)); zz = feval(kernel, zz); if (do_cv) zz(i,:) = 0; endif temp = sum(zz); if (temp > 0) zz = zz / temp; endif z(i,:) = zz'*y; endfor if debug printf("z on node %d: \n", myrank); z' endif endfunction --- NEW FILE: kernel_regression.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # kernel_density: kernel regression estimator # # usage: # dens = kernel_regression(eval_points, depvar, condvars, bandwidth) # # inputs: # eval_points: PxK matrix of points at which to calculate the density # dev_var: Nx1 vector of observations of the dependent variable # condvars: NxK matrix. N observations on the K conditioning variables. # bandwidth: positive scalar, the smoothing parameter. The fit # is more smooth as the bandwidth increases. # do_cv: bool (optional). default false. If true, calculate leave-1-out # density for cross validation # nslaves: int (optional, default 0). Number of compute nodes for parallel evaluation # debug: bool (optional, default false). show results on compute nodes if doing # parallel run # bandwith_matrix (optional): nonsingular KxK matrix. Rotates data. # Default is Choleski decomposition of inverse of covariance, # to approximate independence after the transformation, which # makes a product kernel a reasonable choice. # kernel (optional): string. Name of the kernel function. Default is radial # symmetric Epanechnikov kernel. # outputs: # dens: Px1 vector: the fitted density value at each of the P evaluation points. # function z = kernel_regression(eval_points, depvar, condvars, bandwidth, do_cv, nslaves, debug, bandwith_matrix, kernel) if nargin < 4; error("kernel_regression: at least 4 arguments are required"); endif # set defaults for optional args # default ordinary density, not leave-1-out if (nargin < 5) do_cv = false; endif # default serial if (nargin < 6) nslaves = 0; endif # debug or not (default) if (nargin < 7) debug = false; endif; # default bandwidth matrix (up to factor of proportionality) if (nargin < 8) bandwidth_matrix = chol(cov(condvars)); endif # default bandwidth matrix # default kernel if (nargin < 9) kernel = "__kernel_epanechnikov"; endif # default kernel nn = rows(eval_points); n = rows(depvar); # Inverse bandwidth matrix H_inv H = bandwidth_matrix*bandwidth; H_inv = inv(H); # weight by inverse bandwidth matrix eval_points = eval_points*H_inv; condvars = condvars*H_inv; data = [depvar condvars]; # put it all together for sending to nodes # check if doing this parallel or serial global PARALLEL NSLAVES NEWORLD NSLAVES TAG PARALLEL = 0; if nslaves > 0 PARALLEL = 1; NSLAVES = nslaves; LAM_Init(nslaves, debug); endif if !PARALLEL # ordinary serial version points_per_node = nn; # do the all on this node z = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); else # parallel version z = zeros(nn,1); points_per_node = floor(nn/(NSLAVES + 1)); # number of obsns per slave # The command that the slave nodes will execute cmd=['z_on_node = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); ',... 'MPI_Send(z_on_node, 0, TAG, NEWORLD);']; # send items to slaves NumCmds_Send({"eval_points", "data", "do_cv", "kernel", "points_per_node", "nslaves", "debug","cmd"}, {eval_points, data, do_cv, kernel, points_per_node, nslaves, debug, cmd}); # evaluate last block on master while slaves are busy z_on_node = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); startblock = NSLAVES*points_per_node + 1; endblock = nn; z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node; # collect slaves' results z_on_node = zeros(points_per_node,1); # size may differ between master and compute nodes - reset here for i = 1:NSLAVES MPI_Recv(z_on_node,i,TAG,NEWORLD); startblock = i*points_per_node - points_per_node + 1; endblock = i*points_per_node; z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node; endfor # clean up after parallel LAM_Finalize; endif endfunction --- NEW FILE: __kernel_epanechnikov.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # __kernel_epanechnikov: this function is for internal use by kernel_density # and kernel_regression # # multivariate spherical Epanechnikov kernel # input: PxK matrix - P data points, each of which is in R^K # output: Px1 vector, input matrix passed though the kernel # other multivariate kernel functions should follow this convention function z = __kernel_epanechnikov(z) K = columns(z); # Volume of d-dimensional unit sphere c = pi ^ (K/2) / gamma(K/2 + 1); # compute kernel z = sumsq(z, 2); z = ((1/2) / c * (K + 2) * (1 - z)) .* (z < 1); endfunction --- NEW FILE: kernel_density_nodes.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # under the terms of the GNU General Public License. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # Kernel density, parallel version: do calculations on nodes function z = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug) if (nslaves > 0) global NEWORLD [info, myrank] = MPI_Comm_rank(NEWORLD); else myrank = 0; # if not parallel then do all on master node endif if myrank == 0 # Do this if I'm master startblock = nslaves*points_per_node + 1; endblock = rows(eval_points); else # this is for the slaves startblock = myrank*points_per_node - points_per_node + 1; endblock = myrank*points_per_node; endif # the block of eval_points this node does myeval = eval_points(startblock:endblock,:); nn = rows(myeval); n = rows(data); # calculate distances # note to self: loop is as fast as double kronecker with # reshape. Also, simpler and safe in terms of memory. z = zeros(nn,1); for i = 1:nn zz = data - kron(myeval(i,:), ones(n,1)); zz = feval(kernel, zz); z(i) = sum(zz); if (do_cv) z(i) = z(i) - zz(i); endif endfor if (do_cv) z = z / (n-1); else z = z/n; endif if debug printf("z on node %d: \n", myrank); z' endif endfunction --- NEW FILE: __kernel_normal.m --- # Copyright (C) 2006 Michael Creel <mic...@ua...> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # __kernel_normal: this function is for internal use by kernel_density # and kernel_regression # # product normal kernel # input: PxK matrix - P data points, each of which is in R^K # output: Px1 vector, input matrix passed though the kernel # other multivariate kernel functions should follow this convention function z = __kernel_normal(z) z = normal_pdf(z); z = prod(z,2); endfunction --- NEW FILE: kernel_regression_cvscore.m --- function cvscore = kernel_regression_cvscore(bandwidth, data, depvar) fit = kernel_regression(data, depvar, data, exp(bandwidth), true); cvscore = norm(depvar - fit); endfunction |