From: Michael C. <mc...@us...> - 2004-09-08 14:10:54
|
Update of /cvsroot/octave/octave-forge/main/optim In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32493/main/optim Modified Files: bfgsmin.cc bfgsmin_example.m Log Message: Integrate bfgsmin and lbfgsmin into same file so convergence, etc. is same. Complete argument checking (Go ahead - hit me!). Beautify output. Index: bfgsmin.cc =================================================================== RCS file: /cvsroot/octave/octave-forge/main/optim/bfgsmin.cc,v retrieving revision 1.7 retrieving revision 1.8 diff -u -d -r1.7 -r1.8 --- bfgsmin.cc 7 Sep 2004 10:30:02 -0000 1.7 +++ bfgsmin.cc 8 Sep 2004 14:10:43 -0000 1.8 @@ -16,7 +16,7 @@ // bfgsmin // -// bfgs minimization of function of form +// bfgs or limited memory bfgs minimization of function of form // // [value, return_2,..., return_m] = f(arg_1, arg_2,..., arg_n) // @@ -27,62 +27,170 @@ #include <octave/Cell.h> #include <octave/lo-mappers.h> #include <float.h> +#include "error.h" // define argument checks static bool any_bad_argument(const octave_value_list& args) { + + int i; + if (!args(0).is_string()) - { - error("bfgsmin: first argument must be string holding objective function name"); - return true; - } + { + error("bfgsmin: first argument must be string holding objective function name"); + return true; + } + if (!args(1).is_cell()) + { + error("bfgsmin: second argument must cell array of function arguments"); + return true; + } + + + // controls should be 4 or 5 element cell array of integers + if (args.length() > 2) + { + Cell control (args(2).cell_value()); // get them to look at them + if (error_state) + { + error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); + return true; + } + + // there should be 4 or 5 elements, depending on if bfgs or lbfgs + if (((control.length() < 4) || (control.length() > 5))) { - error("bfgsmin: second argument must cell array of function arguments"); - return true; + error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); + return true; } - if (args.length() == 3) + + // special treatment for 1st element - it can be Inf or an integer + if (!xisinf (control(0).double_value())) + { + int tmp = control(0).int_value(); + if (error_state) + { + error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); + return true; + } + } + + // now the other 3 or 4 elements + for (i=1; i < control.length(); i++) + { + int tmp = control(i).int_value(); + if (error_state) + { + error("bfgsmin: 3rd argument must be a cell array of 4 or 5 integers"); + return true; + } + } + } + + + // tolerances should be 3 element cell array of doubles + if (args.length() > 3) + { + // type is cell + Cell tols (args(3).cell_value()); + if (error_state) + { + error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); + return true; + } + + // there should be 3 elements + if (!(tols.length()==3)) { - if (!args(2).is_cell() || (!(args(2).length() == 4))) - { - error("bfgsmin: 3rd argument, if supplied, must a 4-element cell array of control options"); - return true; - } + error("bfgsmin: 4th argument, if supplied, must a 3-element cell array of tolerances"); + return true; } - if (args.length() == 4) - { - if (!args(3).is_cell() || (!(args(3).length() == 3))) - { - error("bfgsmin: 4th argument, if supplied, must a 3-element cell array of tolerances"); - return true; - } - } + + for (i=0; i<3; i++) + { + double tmp = tols(i).double_value(); + if (error_state) + { + error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); + return true; + } + + if(tmp < 0) + { + error("bfgsmin: 4th argument must be a cell array of 3 positive doubles"); + return true; + } + } + } return false; } +// this is the lbfgs direction, used if control has 5 elements +ColumnVector lbfgs_recursion(int memory, Matrix& sigmas, Matrix& gammas, ColumnVector d) +{ + if (memory == 0) + { + const int n = sigmas.columns(); + ColumnVector sig = sigmas.column(n-1); + ColumnVector gam = gammas.column(n-1); + // do conditioning if there is any memory + double cond = gam.transpose()*gam; + if (cond > 0) + { + cond = (sig.transpose()*gam) / cond; + d = cond*d; + } + return d; + } + else + { + const int k = d.rows(); + const int n = sigmas.columns(); + int i, j; + ColumnVector sig = sigmas.column(memory-1); + ColumnVector gam = gammas.column(memory-1); + double rho; + rho = 1.0 / (gam.transpose() * sig); + double alpha; + alpha = rho * (sig.transpose() * d); + d = d - alpha*gam; + d = lbfgs_recursion(memory - 1, sigmas, gammas, d); + d = d + (alpha - rho * gam.transpose() * d) * sig; + } + return d; +} + DEFUN_DLD(bfgsmin, args, , - "bfgsmin: bfgs minimization of a function. See bfgsmin_example.m\n\ +"bfgsmin: bfgs minimization of a function with respect to a column vector.\n\ \n\ -[x, obj, convergence] = bfgsmin(\"f\", {args}, {control}, {tols})\n\ +By default, numeric derivatives are used, but see bfgsmin_example.m\n\ +for how to use analytic derivatives\n\ +\n\ +Usage: [x, obj, convergence] = bfgsmin(\"f\", args, control, tols)\n\ \n\ Arguments:\n\ * \"f\": function name (string)\n\ -* {args}: a cell array that holds all arguments of the function,\n\ -* {control}: an optional cell array of 4 elements\n\ +* args: a cell array that holds all arguments of the function\n\ + The argument with respect to which minimization is done\n\ + MUST be a COLUMN vector\n\ +* control: an optional cell array of 4 or 5 elements\n\ * elem 1: maximum iterations (-1 = infinity (default))\n\ * elem 2: verbosity\n\ - - 0 = no screen output (default)\n\ - - 1 = summary every iteration\n\ - - 2 = only last iteration summary\n\ + 0 = no screen output (default)\n\ + 1 = summary every iteration\n\ + 2 = only final results\n\ * elem 3: convergence criterion\n\ - - 1 = strict (function, gradient and param change) (default)\n\ - - 2 = weak - only function convergence required\n\ + 1 = strict (function, gradient and param change) (default)\n\ + 2 = weak - only function convergence required\n\ * elem 4: arg with respect to which minimization is done\n\ (default = first)\n\ -* {tols}: an optional cell array of 3 elements\n\ + * elem 5: (optimal) Memory limit for lbfgs. If it's a positive integer\n\ + then lbfgs will be use. Otherwise ordinary bfgs is used\n\ +* tols: an optional cell array of 3 elements\n\ * elem 1: function change tolerance, default 1e-12\n\ * elem 2: parameter change tolerance, default 1e-6\n\ * elem 3: gradient tolerance, default 1e-4\n\ @@ -118,10 +226,10 @@ { int nargin = args.length(); if ((nargin < 2) || (nargin > 4)) - { - error("bfgsmin: you must supply 2, 3 or 4 arguments"); - return octave_value_list(); - } + { + error("bfgsmin: you must supply 2, 3 or 4 arguments"); + return octave_value_list(); + } // check the arguments if (any_bad_argument(args)) return octave_value_list(); @@ -138,68 +246,92 @@ int max_iters, verbosity, criterion, minarg; int convergence, iter; - int gradient_failed, i, j, k, conv_fun, conv_param, conv_grad; + int memory, gradient_failed, i, j, k, conv_fun, conv_param, conv_grad; int have_gradient = 0; double func_tol, param_tol, gradient_tol, stepsize, obj_value; double obj_in, last_obj_value, denominator, test; Matrix H, tempmatrix, H1, H2; - ColumnVector thetain, d, g, g_new, p, q; + ColumnVector thetain, d, g, g_new, p, q, sig, gam; + + + // Default values for controls + max_iters = INT_MAX; // no limit on iterations + verbosity = 0; // by default don't report results to screen + criterion = 1; // strong convergence required + minarg = 1; // by default, first arg is one over which we minimize + memory = 0; // use provided controls, if applicable - if (args.length() >= 3) - { - Cell control (args(2).cell_value()); - // if (xisinf (control(0).double_value())) - // max_iters = -1; - //else - max_iters = control(0).int_value(); - if (max_iters == -1) max_iters = INT_MAX; - verbosity = control(1).int_value(); - criterion = control(2).int_value(); - minarg = control(3).int_value(); - } - else - { - // Default values for controls - max_iters = INT_MAX; // no limit on iterations - verbosity = 0; // by default don't report results to screen - criterion = 1; // strong convergence required - minarg = 1; // by default, first arg is one over which we minimize - } - + if (args.length() > 2) + { + Cell control (args(2).cell_value()); + if (xisinf (control(0).double_value())) // this is to allow 'Inf' as first element of control + max_iters = -1; + else + max_iters = control(0).int_value(); + if (max_iters == -1) max_iters = INT_MAX; + verbosity = control(1).int_value(); + criterion = control(2).int_value(); + minarg = control(3).int_value(); + if (control.length() > 4) memory = control(4).int_value(); + } + + + // type checking for minimization parameter done here, since we don't know minarg + // until now + if (!f_args(minarg - 1).is_real_matrix()) + { + error("bfgsmin: minimization must be with respect to a column vector"); + return octave_value_list(); + } + if (f_args(minarg - 1).columns() != 1) + { + error("bfgsmin: minimization must be with respect to a column vector"); + return octave_value_list(); + } + // use provided tolerances, if applicable if (args.length() == 4) - { - Cell tols (args(3).cell_value()); - func_tol = tols(0).double_value(); - param_tol = tols(1).double_value(); - gradient_tol = tols(2).double_value(); - } + { + Cell tols (args(3).cell_value()); + func_tol = tols(0).double_value(); + param_tol = tols(1).double_value(); + gradient_tol = tols(2).double_value(); + } else - { - // Default values for tolerances - func_tol = 1e-12; - param_tol = 1e-6; - gradient_tol = 1e-4; - - } + { + // Default values for tolerances + func_tol = 1e-12; + param_tol = 1e-6; + gradient_tol = 1e-4; + } - step_args(0) = f; + + // arguments for stepsize function + step_args(0) = f; step_args(1) = f_args; step_args(3) = minarg; + // arguments for numgradient g_args(0) = f; g_args(1) = f_args; g_args(2) = minarg; + // arguments for celleval (to calculate objective function value) c_args(0) = f; c_args(1) = f_args; - ColumnVector theta = f_args(minarg - 1).column_vector_value(); - + + // get the minimization argument + ColumnVector theta = f_args(minarg - 1).column_vector_value(); k = theta.rows(); + // containers for items in limited memory version + Matrix sigmas(k,memory); + Matrix gammas(k,memory); + + // initialize things convergence = -1; // if this doesn't change, it means that maxiters were exceeded thetain = theta; @@ -253,9 +385,15 @@ conv_param = -1; conv_grad = -1; - d = -H*g; + if(memory > 0) // lbfgs + { + if (iter < memory) d = lbfgs_recursion(iter, sigmas, gammas, g); + else d = lbfgs_recursion(memory, sigmas, gammas, g); + d = -d; + } + else d = -H*g; // ordinary bfgs - // stepsize: try bfgs direction, then steepest descent if it fails + // stepsize: try (l)bfgs direction, then steepest descent if it fails step_args(2) = d; f_args(minarg - 1) = theta; step_args(1) = f_args; @@ -263,72 +401,79 @@ stepsize = f_return(0).double_value(); obj_value = f_return(1).double_value(); if (stepsize == 0.0) // fall back to steepest descent - { - d = -g; // try steepest descent - step_args(2) = d; - f_return = feval("newtonstep", step_args); - stepsize = f_return(0).double_value(); - obj_value = f_return(1).double_value(); - } + { + d = -g; // try steepest descent + step_args(2) = d; + f_return = feval("newtonstep", step_args); + stepsize = f_return(0).double_value(); + obj_value = f_return(1).double_value(); + } p = stepsize*d; // check normal convergence: all 3 must be satisfied // function convergence if (fabs(last_obj_value) > 1.0) - { - conv_fun = (fabs(obj_value - last_obj_value)/fabs(last_obj_value)) < func_tol; - } + { + conv_fun = (fabs(obj_value - last_obj_value)/fabs(last_obj_value)) < func_tol; + } else - { - conv_fun = fabs(obj_value - last_obj_value) < func_tol; - } + { + conv_fun = fabs(obj_value - last_obj_value) < func_tol; + } // parameter change convergence test = sqrt(theta.transpose() * theta); if (test > 1) - { - conv_param = sqrt(p.transpose() * p) / test < param_tol ; - - } + { + conv_param = sqrt(p.transpose() * p) / test < param_tol ; + + } else - { - conv_param = sqrt(p.transpose() * p) < param_tol; - } + { + conv_param = sqrt(p.transpose() * p) < param_tol; + } // gradient convergence conv_grad = sqrt(g.transpose() * g) < gradient_tol; // Want intermediate results? - if (verbosity == 1) - { - printf("\nBFGSMIN intermediate results: Iteration %d\n",iter); - printf("Stepsize %8.7f\n", stepsize); - if (have_gradient) printf("Using analytic gradient\n"); - else printf("Using numeric gradient\n"); - printf("Objective function value %16.10f\n", last_obj_value); - printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); - printf(" params gradient change\n"); - for (j = 0; j<k; j++) - { - printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j)); - } - printf("\n"); - } + if ((verbosity > 0) && (verbosity < 2)) + { + printf("\n======================================================\n"); + printf("BFGSMIN intermediate results\n"); + printf("\n"); + if (memory > 0) printf("Using LBFGS, memory is last %d iterations\n",memory); + if (have_gradient) printf("Using analytic gradient\n"); + else printf("Using numeric gradient\n"); + printf("\n"); + printf("------------------------------------------------------\n"); + printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); + printf("------------------------------------------------------\n"); + printf("Objective function value %g\n", last_obj_value); + printf("Stepsize %g\n", stepsize); + printf("%d iterations\n", iter); + printf("------------------------------------------------------\n"); + printf("\n param gradient change\n"); + for (j = 0; j<k; j++) + { + printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j)); + } + } // Are we done? if (criterion == 1) - { - if (conv_fun && conv_param && conv_grad) - { - convergence = 1; - break; - } - } + { + if (conv_fun && conv_param && conv_grad) + { + convergence = 1; + break; + } + } else if (conv_fun) - { - convergence = 1; - break; - } + { + convergence = 1; + break; + } last_obj_value = obj_value; @@ -337,71 +482,114 @@ // new gradient f_args(minarg - 1) = theta; if (have_gradient) - { - c_args(1) = f_args; - f_return = feval("celleval", c_args); - g_new = f_return(1).column_vector_value(); - } + { + c_args(1) = f_args; + f_return = feval("celleval", c_args); + g_new = f_return(1).column_vector_value(); + } else // use numeric gradient - { - g_args(1) = f_args; - f_return = feval("numgradient", g_args); - tempmatrix = f_return(0).matrix_value(); - g_new = tempmatrix.row(0).transpose(); - } + { + g_args(1) = f_args; + f_return = feval("numgradient", g_args); + tempmatrix = f_return(0).matrix_value(); + g_new = tempmatrix.row(0).transpose(); + } // Check that gradient is ok gradient_failed = 0; // test = 1 means gradient failed for (i=0; i<k;i++) - { - gradient_failed = gradient_failed + xisnan(g_new(i)); - } + { + gradient_failed = gradient_failed + xisnan(g_new(i)); + } - // Hessian update if gradient ok - if (!gradient_failed) - { - q = g_new-g; - g = g_new; - denominator = q.transpose()*p; - if ((fabs(denominator) < DBL_EPSILON)) // reset Hessian if necessary - { - if (verbosity == 1) printf("bfgsmin: Hessian reset\n"); - H = identity_matrix(k,k); - } - else - { - H1 = (1.0+(q.transpose() * H * q) / denominator) / denominator \ - * (p * p.transpose()); - H2 = (p * q.transpose() * H + H*q*p.transpose()); - H2 = H2 / denominator; - H = H + H1 - H2; - } - } - else H = identity_matrix(k,k); // reset hessian if gradient fails - // then try to start again with steepest descent + if (memory == 0) //use bfgs + { + // Hessian update if gradient ok + if (!gradient_failed) + { + q = g_new-g; + g = g_new; + denominator = q.transpose()*p; + if ((fabs(denominator) < DBL_EPSILON)) // reset Hessian if necessary + { + if (verbosity == 1) printf("bfgsmin: Hessian reset\n"); + H = identity_matrix(k,k); + } + else + { + H1 = (1.0+(q.transpose() * H * q) / denominator) / denominator \ + * (p * p.transpose()); + H2 = (p * q.transpose() * H + H*q*p.transpose()); + H2 = H2 / denominator; + H = H + H1 - H2; + } + } + else H = identity_matrix(k,k); // reset hessian if gradient fails + // then try to start again with steepest descent + } + else // use lbfgs + { + // save components for Hessian if gradient ok + if (!gradient_failed) + { + sig = p; // change in parameter + gam = g_new - g; // change in gradient + g = g_new; + + // shift remembered vectors to the right (forget last) + for(j = memory - 1; j > 0; j--) + { + for(i = 0; i < k; i++) + { + sigmas(i,j) = sigmas(i,j-1); + gammas(i,j) = gammas(i,j-1); + } + } + + // insert new vectors in left-most column + for(i = 0; i < k; i++) + { + sigmas(i, 0) = sig(i); + gammas(i, 0) = gam(i); + } + } + else // failed gradient - loose memory and use previous theta + { + sigmas.fill(0.0); + gammas.fill(0.0); + theta = theta - p; + } + } } // Want last iteration results? - if (verbosity == 2) - { - printf("\n================================================\n"); - printf("BFGSMIN final results\n"); - if (convergence == -1) printf("NO CONVERGENCE: maxiters exceeded\n"); - if ((convergence == 1) & (criterion == 1)) printf("STRONG CONVERGENCE\n"); - if ((convergence == 1) & !(criterion == 1)) printf("WEAK CONVERGENCE\n"); - if (convergence == 2) printf("NO CONVERGENCE: algorithm failed\n"); - if (have_gradient) printf("Used analytic gradient\n"); - else printf("Used numeric gradient\n"); - printf("\nObj. fn. value %f Iteration %d\n", obj_value, iter); - printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); - printf("\n param gradient change\n"); - for (j = 0; j<k; j++) - { - printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j)); - } - printf("================================================\n"); + if (verbosity > 0) + { - } + printf("\n======================================================\n"); + printf("BFGSMIN final results\n"); + printf("\n"); + if (memory > 0) printf("Used LBFGS, memory is last %d iterations\n",memory); + if (have_gradient) printf("Used analytic gradient\n"); + else printf("Used numeric gradient\n"); + printf("\n"); + printf("------------------------------------------------------\n"); + if (convergence == -1) printf("NO CONVERGENCE: max iters exceeded\n"); + if ((convergence == 1) & (criterion == 1)) printf("STRONG CONVERGENCE\n"); + if ((convergence == 1) & !(criterion == 1)) printf("WEAK CONVERGENCE\n"); + if (convergence == 2) printf("NO CONVERGENCE: algorithm failed\n"); + printf("Function conv %d Param conv %d Gradient conv %d\n", conv_fun, conv_param, conv_grad); + printf("------------------------------------------------------\n"); + printf("Objective function value %g\n", last_obj_value); + printf("Stepsize %g\n", stepsize); + printf("%d iterations\n", iter); + printf("------------------------------------------------------\n"); + printf("\n param gradient change\n"); + for (j = 0; j<k; j++) + { + printf("%8.4f %8.4f %8.4f\n",theta(j),g(j),p(j)); + } + } f_return(0) = theta; f_return(1) = obj_value; f_return(2) = convergence; Index: bfgsmin_example.m =================================================================== RCS file: /cvsroot/octave/octave-forge/main/optim/bfgsmin_example.m,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- bfgsmin_example.m 7 Sep 2004 10:31:21 -0000 1.1 +++ bfgsmin_example.m 8 Sep 2004 14:10:43 -0000 1.2 @@ -16,14 +16,23 @@ 1; # This shows how to call bfgsmin.m + +# You can minimize w.r.t. any argument, but the argument minimization is done over +# must be a column vector. + +# It is possible to supply analytic derivatives. If the second return of the objective +# function is a column vector the same size as the minimization argument, it is assumed +# to be the gradient vector. + page_screen_output = 0; # example obj. fn function obj_value = objective1(x, y) - obj_value = log(1 + x'*x) + log(1 + y'*y); + z = y - ones(size(y)); + obj_value = log(1 + x'*x) + log(1 + z'*z); endfunction -# example obj. fn.: with gradient +# example obj. fn.: with gradient w.r.t. x function [obj_value, gradient] = objective2(x, y) obj_value = (1 + x'*x) + (1 + y'*y); gradient = 2*x; #(1/(1 + x'*x))*2*x; @@ -31,20 +40,20 @@ # Check bfgsmin, illustrating variations -printf("EXAMPLE 1: Numeric gradient\n"); +printf("\nEXAMPLE 1: Numeric gradient\n"); x0 = ones(2,1); y0 = 2*ones(2,1); control = {10,2,1,1}; # maxiters, verbosity, conv. reg., arg_to_min [theta, obj_value, convergence] = bfgsmin("objective1", {x0, y0}, control); printf("\n"); -printf("EXAMPLE 2: Analytic gradient\n"); +printf("\nEXAMPLE 2: Analytic gradient\n"); control = {50,2,1,1}; # maxiters, verbosity, conv. reg., arg_to_min [theta, obj_value, convergence] = bfgsmin("objective2", {x0, y0}, control); printf("\n"); -printf("EXAMPLE 3: Numeric gradient, non-default tolerances\n"); +printf("\nEXAMPLE 3: Numeric gradient, non-default tolerances\n"); printf("Note how the gradient tolerance can't be achieved with\n"); printf("numeric differentiation\n"); control = {100,2,1,1}; # maxiters, verbosity, conv. reg., arg_to_min @@ -52,8 +61,27 @@ [theta, obj_value, convergence] = bfgsmin("objective1", {x0, y0}, control, tols); printf("\n"); -printf("EXAMPLE 4: Funny case - min wrt 2nd arg."); +printf("\nEXAMPLE 4: Funny case - min wrt 2nd arg.\n"); control = {50,2,1,2}; # maxiters, verbosity, conv. reg., arg_to_min [theta, obj_value, convergence] = bfgsmin("objective1", {x0, y0}, control); +printf("\nEXAMPLE 5: Limited memory BFGS, Numeric gradient\n"); +x0 = ones(2,1); +y0 = 2*ones(2,1); +control = {10,2,1,1, 3}; # maxiters, verbosity, conv. reg., arg_to_min +[theta, obj_value, convergence] = bfgsmin("objective1", {x0, y0}, control); +printf("\n"); + + +printf("\nEXAMPLE 6: Limited memory BFGS, Analytic gradient\n"); +control = {50,2,1,1,3}; # maxiters, verbosity, conv. reg., arg_to_min +[theta, obj_value, convergence] = bfgsmin("objective2", {x0, y0}, control); +printf("\n"); + + + + + + + |