[Octave-cvsupdate] octave-forge/main/econometrics/inst kernel_density_cvscore.m, NONE, 1.1 kernel_d

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/octave/octave-forge/main/econometrics/inst
In directory sc8-pr-cvs3.sourceforge.net:/tmp/cvs-serv23833/main/econometrics/inst

Added Files:
	kernel_density_cvscore.m kernel_density.m 
	kernel_density_nodes.m __kernel_epanechnikov.m 
	kernel_example.m __kernel_normal.m kernel_optimal_bandwidth.m 
	kernel_regression_cvscore.m kernel_regression.m 
	kernel_regression_nodes.m 
Log Message:
initial commit of functions for nonparametric smoothing. Includes kernel regression and density. Can make use of MPITB for parallel computing if it's available.

--- NEW FILE: kernel_density.m ---
# Copyright (C) 2006 Michael Creel <mic...@ua...>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# kernel_density: multivariate kernel density estimator
#
# usage:
# 	dens = kernel_density(eval_points, data, bandwidth)
#
# inputs:
#	eval_points: PxK matrix of points at which to calculate the density
# 	data: NxK matrix of data points
#	bandwidth: positive scalar, the smoothing parameter. The fit
# 		is more smooth as the bandwidth increases.
#	do_cv: bool (optional). default false. If true, calculate leave-1-out
#		 density for cross validation
#	nslaves: int (optional, default 0). Number of compute nodes for parallel evaluation
#	debug: bool (optional, default false). show results on compute nodes if doing
#		 parallel run
#	bandwith_matrix (optional): nonsingular KxK matrix. Rotates data.
# 		Default is Choleski decomposition of inverse of covariance,
#		to approximate independence after the transformation, which
#		makes a product kernel a reasonable choice.
#	kernel (optional): string. Name of the kernel function. Default is radial
#		symmetric Epanechnikov kernel.
# outputs:
#	dens: Px1 vector: the fitted density value at each of the P evaluation points.
#
# References:
# Wand, M.P. and Jones, M.C. (1995), 'Kernel smoothing'.
# http://www.xplore-stat.de/ebooks/scripts/spm/html/spmhtmlframe73.html

function z = kernel_density(eval_points, data, bandwidth, do_cv, nslaves, debug, bandwith_matrix, kernel)

	if nargin < 3; error("kernel_density: at least 3 arguments are required"); endif

	# set defaults for optional args
	# default ordinary density, not leave-1-out
	if (nargin < 4)	do_cv = false; endif
	# default serial
	if (nargin < 5)	nslaves = 0; endif
	# debug or not (default)
	if (nargin < 6)	debug = false; endif;
	# default bandwidth matrix (up to factor of proportionality)
	if (nargin < 7) bandwidth_matrix = chol(cov(data)); endif # default bandwidth matrix
	# default kernel
	if (nargin < 8) kernel = "__kernel_epanechnikov"; endif 	# default kernel

	nn = rows(eval_points);
	n = rows(data);

	# Inverse bandwidth matrix H_inv
	H = bandwidth_matrix*bandwidth;
	H_inv = inv(H);

	# weight by inverse bandwidth matrix
	eval_points = eval_points*H_inv;
	data = data*H_inv;

	# check if doing this parallel or serial
	global PARALLEL NSLAVES NEWORLD NSLAVES TAG
	PARALLEL = 0;

	if nslaves > 0
		PARALLEL = 1;
		NSLAVES = nslaves;
		LAM_Init(nslaves, debug);
	endif

	if !PARALLEL # ordinary serial version
		points_per_node = nn; # do the all on this node
		z = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug);
	else # parallel version
		z = zeros(nn,1);
		points_per_node = floor(nn/(NSLAVES + 1)); # number of obsns per slave
		# The command that the slave nodes will execute
		cmd=['z_on_node = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); ',...
		'MPI_Send(z_on_node, 0, TAG, NEWORLD);'];

		# send items to slaves

		NumCmds_Send({"eval_points", "data", "do_cv", "kernel", "points_per_node", "nslaves", "debug","cmd"}, {eval_points, data, do_cv, kernel, points_per_node, nslaves, debug, cmd});

		# evaluate last block on master while slaves are busy
		z_on_node = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug);
		startblock = NSLAVES*points_per_node + 1;
		endblock = nn;
		z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node;

		# collect slaves' results
		z_on_node = zeros(points_per_node,1); # size may differ between master and compute nodes - reset here
		for i = 1:NSLAVES
			MPI_Recv(z_on_node,i,TAG,NEWORLD);
			startblock = i*points_per_node - points_per_node + 1;
			endblock = i*points_per_node;
			z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node;
		endfor

		# clean up after parallel
		LAM_Finalize;
	endif
	z = z*det(H_inv);
endfunction

--- NEW FILE: kernel_optimal_bandwidth.m ---
function bandwidth = kernel_optimal_bandwidth(data, depvar)
	# SA controls
	ub = 1;
	lb = -5;
	nt = 1;
	ns = 1;
	rt = 0.05;
	maxevals = 50;
	neps = 5;
	functol = 1e-2;
	paramtol = 1e-3;
	verbosity = 0;
	minarg = 1;
	sa_control = { lb, ub, nt, ns, rt, maxevals, neps, functol, paramtol, verbosity, 1};

	# bfgs controls
	bfgs_control = {100,0,0};

	if (nargin == 1)
		bandwidth = samin("kernel_density_cvscore", {0, data}, sa_control);
		bandwidth = bfgsmin("kernel_density_cvscore", {bandwidth, data}, bfgs_control);
	else
		bandwidth = samin("kernel_regression_cvscore", {0, data, depvar}, sa_control);
		bandwidth = bfgsmin("kernel_regression_cvscore", {bandwidth, data, depvar}, bfgs_control);
	endif
	bandwidth = exp(bandwidth);
endfunction

--- NEW FILE: kernel_example.m ---
# Copyright (C) 2007 Michael Creel <mic...@ua...>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# kernel_example: examples of how to use kernel density and regression functions
#
# usage: kernel_example;

# sample size - should get better fit (on average) as this increases
n = 500;

# set this to greater than 0 to try parallel computations (requires MPITB)
compute_nodes = 0;

nodes = compute_nodes + 1; # count master node

############################################################
# kernel regression example
# uniformly spaced data points on [0,2]
x = 1:n;
x = x';
x = 2*x/n;
# generate dependent variable
trueline =  x + (x.^2)/2 - 3.1*(x.^3)/3 + 1.2*(x.^4)/4;
sig = 0.5;
y = trueline + sig*randn(n,1);
# default bandwidth
bandwidth = 0.45;
# get optimal bandwidth (time consuming, uncomment if you want to try it)
# bandwidth = kernel_optimal_bandwidth(x, y);
# get the fit and do the plot
tic;
fit = kernel_regression(x, y, x, bandwidth, false, compute_nodes);
t1 = toc;
printf("\n");
printf("########################################################################\n");
printf("Kernel regression example\n");
printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1);
grid("on");
title("Example 1: Kernel regression fit");
plot(x, fit, x, trueline);

############################################################
# kernel density example: univariate - fit to Chi^2(3) data
data = sumsq(randn(n,3),2);
# evaluation point are on a grid for plotting
stepsize = 0.2;
grid_x = (-1:stepsize:11)';
bandwidth = 0.55;
# get optimal bandwidth (time consuming, uncomment if you want to try it)
# bandwidth = kernel_optimal_bandwidth(data);
# get the fitted density and do a plot
tic;
dens = kernel_density(grid_x, data, bandwidth, false, compute_nodes);
t1 = toc;
printf("\n");
printf("########################################################################\n");
printf("Univariate kernel density example\n");
printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1);
printf("A rough integration under the fitted univariate density is %f\n", sum(dens)*stepsize);
figure();
title("Example 2: Kernel density fit: Univariate Chi^2(3) data");
xlabel("true density is Chi^2(3)");
plot(grid_x, [dens chisquare_pdf(grid_x,3)]);

############################################################
# kernel density example: bivariate
# X ~ N(0,1)
# Y ~ Chi squared(3)
# X, Y are dependent
d = randn(n,3);
data = [d(:,1) sumsq(d,2)];
# evaluation points are on a grid for plotting
stepsize = 0.2;
a = (-5:stepsize:5)'; # for the N(0,1)
b = (-1:stepsize:9)';  # for the Chi squared(3)
gridsize = rows(a);
[grid_x, grid_y] = meshgrid(a, b);
eval_points = [vec(grid_x) vec(grid_y)];
bandwidth = 0.85;
# get optimal bandwidth (time consuming, uncomment if you want to try it)
# bandwidth = kernel_optimal_bandwidth(data);
# get the fitted density and do a plot
tic;
dens = kernel_density(eval_points, data, bandwidth, false, compute_nodes);
t1 = toc;
printf("\n");
printf("########################################################################\n");
printf("Multivatiate kernel density example\n");
printf("time using %d data points and %d compute nodes: %f\n", n, nodes, t1);
dens = reshape(dens, gridsize, gridsize);
printf("A rough integration under the fitted bivariate density is %f\n", sum(sum(dens))*stepsize^2);
figure();
legend("off");
title("Example 3: Kernel density fit: dependent bivatiate data");
xlabel("true marginal density is N(0,1)");
ylabel("true marginal density is Chi^2(3)");
surf(grid_x, grid_y, dens);

# more extensive test of parallel
if compute_nodes > 0 # only try this if parallel is available
	############################################################
	# kernel density example: bivariate
	# X ~ N(0,1)
	# Y ~ Chi squared(3)
	# X, Y are dependent
	# evaluation points are on a grid for plotting
	stepsize = 0.2;
	a = (-5:stepsize:5)'; # for the N(0,1)
	b = (-1:stepsize:9)';  # for the Chi squared(3)
	gridsize = rows(a);
	[grid_x, grid_y] = meshgrid(a, b);
	eval_points = [vec(grid_x) vec(grid_y)];
	bandwidth = 0.85;
	ns = [1000; 2000; 4000; 8000];
	printf("\n");
	printf("########################################################################\n");
	printf("multivariate kernel density example with several sample sizes serial/parallel timings\n");
	for i = 1:rows(ns)
		for compute_nodes = 0:1
			nodes = compute_nodes + 1;
			n = ns(i,:);
			d = randn(n,3);
			data = [d(:,1) sumsq(d,2)];
			tic;
			dens = kernel_density(eval_points, data, bandwidth, false, compute_nodes);
			t1 = toc;
			printf(" %d data points and %d compute nodes: %f\n", n, nodes, t1);
		endfor
	endfor
endif
--- NEW FILE: kernel_density_cvscore.m ---
function cvscore = kernel_density_cvscore(bandwidth, data, depvar)
		dens = kernel_density(data, data, exp(bandwidth), true);
		dens = dens + eps; # some kernels can assign zero density
		cvscore = -mean(log(dens));
endfunction

--- NEW FILE: kernel_regression_nodes.m ---
# Copyright (C) 2006  Michael Creel <mic...@ua...>
# under the terms of the GNU General Public License.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# Kernel regression, parallel version: do calculations on nodes

function z = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug)

	if (nslaves > 0)
		global NEWORLD
		[info, myrank] = MPI_Comm_rank(NEWORLD);
	else myrank = 0; # if not parallel then do all on master node
	endif

	if myrank == 0 # Do this if I'm master
		startblock = nslaves*points_per_node + 1;
		endblock = rows(eval_points);
	else	# this is for the slaves
		startblock = myrank*points_per_node - points_per_node + 1;
		endblock = myrank*points_per_node;
	endif

	# the block of eval_points this node does
	myeval = eval_points(startblock:endblock,:);
	nn = rows(myeval);
	n = rows(data);

	y = data(:,1);
	data = data(:,2:columns(data));

	# calculate distances
	# note to self: loop is as fast as double kronecker with
	# reshape. Also, simpler and safe in terms of memory.
	z = zeros(nn,1);
	for i = 1:nn
		zz = data - kron(myeval(i,:), ones(n,1));
		zz = feval(kernel, zz);
		if (do_cv) zz(i,:) = 0; endif
		temp = sum(zz);
		if (temp > 0) zz = zz / temp; endif
		z(i,:) = zz'*y;
	endfor

	if debug
		printf("z on node %d: \n", myrank);
		z'
	endif
endfunction

--- NEW FILE: kernel_regression.m ---
# Copyright (C) 2006 Michael Creel <mic...@ua...>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# kernel_density: kernel regression estimator
#
# usage:
# 	dens = kernel_regression(eval_points, depvar, condvars, bandwidth)
#
# inputs:
#	eval_points: PxK matrix of points at which to calculate the density
#	dev_var: Nx1 vector of observations of the dependent variable
# 	condvars: NxK matrix. N observations on the K conditioning variables.
#	bandwidth: positive scalar, the smoothing parameter. The fit
# 		is more smooth as the bandwidth increases.
#	do_cv: bool (optional). default false. If true, calculate leave-1-out
#		density for cross validation
#	nslaves: int (optional, default 0). Number of compute nodes for parallel evaluation
#	debug: bool (optional, default false). show results on compute nodes if doing
#		 parallel run
#	bandwith_matrix (optional): nonsingular KxK matrix. Rotates data.
# 		Default is Choleski decomposition of inverse of covariance,
#		to approximate independence after the transformation, which
#		makes a product kernel a reasonable choice.
#	kernel (optional): string. Name of the kernel function. Default is radial
#		symmetric Epanechnikov kernel.
# outputs:
#	dens: Px1 vector: the fitted density value at each of the P evaluation points.
#

function z = kernel_regression(eval_points, depvar, condvars, bandwidth, do_cv, nslaves, debug, bandwith_matrix, kernel)

	if nargin < 4; error("kernel_regression: at least 4 arguments are required"); endif

	# set defaults for optional args
	# default ordinary density, not leave-1-out
	if (nargin < 5)	do_cv = false; endif
	# default serial
	if (nargin < 6)	nslaves = 0; endif
	# debug or not (default)
	if (nargin < 7)	debug = false; endif;
	# default bandwidth matrix (up to factor of proportionality)
	if (nargin < 8) bandwidth_matrix = chol(cov(condvars)); endif # default bandwidth matrix
	# default kernel
	if (nargin < 9) kernel = "__kernel_epanechnikov"; endif 	# default kernel

	nn = rows(eval_points);
	n = rows(depvar);

	# Inverse bandwidth matrix H_inv
	H = bandwidth_matrix*bandwidth;
	H_inv = inv(H);

	# weight by inverse bandwidth matrix
	eval_points = eval_points*H_inv;
	condvars = condvars*H_inv;

	data = [depvar condvars]; # put it all together for sending to nodes

	# check if doing this parallel or serial
	global PARALLEL NSLAVES NEWORLD NSLAVES TAG
	PARALLEL = 0;

	if nslaves > 0
		PARALLEL = 1;
		NSLAVES = nslaves;
		LAM_Init(nslaves, debug);
	endif

	if !PARALLEL # ordinary serial version
		points_per_node = nn; # do the all on this node
		z = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug);
	else # parallel version
		z = zeros(nn,1);
		points_per_node = floor(nn/(NSLAVES + 1)); # number of obsns per slave
		# The command that the slave nodes will execute
		cmd=['z_on_node = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug); ',...
		'MPI_Send(z_on_node, 0, TAG, NEWORLD);'];

		# send items to slaves

		NumCmds_Send({"eval_points", "data", "do_cv", "kernel", "points_per_node", "nslaves", "debug","cmd"}, {eval_points, data, do_cv, kernel, points_per_node, nslaves, debug, cmd});

		# evaluate last block on master while slaves are busy
		z_on_node = kernel_regression_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug);
		startblock = NSLAVES*points_per_node + 1;
		endblock = nn;
		z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node;

		# collect slaves' results
		z_on_node = zeros(points_per_node,1); # size may differ between master and compute nodes - reset here
		for i = 1:NSLAVES
			MPI_Recv(z_on_node,i,TAG,NEWORLD);
			startblock = i*points_per_node - points_per_node + 1;
			endblock = i*points_per_node;
			z(startblock:endblock,:) = z(startblock:endblock,:) + z_on_node;
		endfor

		# clean up after parallel
		LAM_Finalize;
	endif
endfunction

--- NEW FILE: __kernel_epanechnikov.m ---
# Copyright (C) 2006 Michael Creel <mic...@ua...>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# __kernel_epanechnikov: this function is for internal use by kernel_density
# and kernel_regression
#
# multivariate spherical Epanechnikov kernel
# input: PxK matrix - P data points, each of which is in R^K
# output: Px1 vector, input matrix passed though the kernel
# other multivariate kernel functions should follow this convention

function z = __kernel_epanechnikov(z)

	K = columns(z);

	# Volume of d-dimensional unit sphere
	c = pi ^ (K/2) / gamma(K/2 + 1);

	# compute kernel
	z  =  sumsq(z, 2);
	z = ((1/2) / c * (K + 2) * (1 - z)) .* (z < 1);

endfunction

--- NEW FILE: kernel_density_nodes.m ---
# Copyright (C) 2006  Michael Creel <mic...@ua...>
# under the terms of the GNU General Public License.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# Kernel density, parallel version: do calculations on nodes

function z = kernel_density_nodes(eval_points, data, do_cv, kernel, points_per_node, nslaves, debug)

	if (nslaves > 0)
		global NEWORLD
		[info, myrank] = MPI_Comm_rank(NEWORLD);
	else myrank = 0; # if not parallel then do all on master node
	endif

	if myrank == 0 # Do this if I'm master
		startblock = nslaves*points_per_node + 1;
		endblock = rows(eval_points);
	else	# this is for the slaves
		startblock = myrank*points_per_node - points_per_node + 1;
		endblock = myrank*points_per_node;
	endif

	# the block of eval_points this node does
	myeval = eval_points(startblock:endblock,:);
	nn = rows(myeval);
	n = rows(data);

	# calculate distances
	# note to self: loop is as fast as double kronecker with
	# reshape. Also, simpler and safe in terms of memory.
	z = zeros(nn,1);
	for i = 1:nn
		zz = data - kron(myeval(i,:), ones(n,1));
		zz = feval(kernel, zz);
		z(i) = sum(zz);
		if (do_cv) z(i) = z(i) - zz(i); endif
	endfor

	if (do_cv) z = z / (n-1); else 	z = z/n; endif

	if debug
		printf("z on node %d: \n", myrank);
		z'
	endif
endfunction

--- NEW FILE: __kernel_normal.m ---
# Copyright (C) 2006 Michael Creel <mic...@ua...>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# __kernel_normal: this function is for internal use by kernel_density
# and kernel_regression
#
# product normal kernel
# input: PxK matrix - P data points, each of which is in R^K
# output: Px1 vector, input matrix passed though the kernel
# other multivariate kernel functions should follow this convention

function z = __kernel_normal(z)

	z = normal_pdf(z);
	z = prod(z,2);

endfunction

--- NEW FILE: kernel_regression_cvscore.m ---
function cvscore = kernel_regression_cvscore(bandwidth, data, depvar)
	fit = kernel_regression(data, depvar, data, exp(bandwidth), true);
	cvscore = norm(depvar - fit);
endfunction

[Octave-cvsupdate] octave-forge/main/econometrics/inst kernel_density_cvscore.m, NONE, 1.1 kernel_d

A collection of packages providing extra functionality for GNU Octave