[r80]: src / class_borders.cc  Maximize  Restore  History

Download this file

387 lines (327 with data), 13.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
//Copyright (C) 2007 Peter Mills. All rights reserved.
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_linalg.h>
#include "full_util.h"
#include "linked.h"
#include "agf_lib.h"
#define WC_DEFAULT 20
#define SMALL_DATASET 200
using namespace std;
using namespace libagf;
using namespace libpetey;
int main(int argc, char *argv[]) {
char *vecfile; //back to the old way...
char *classfile;
char *brdfile; //binary file sampling border
char *grdfile; //binary file containing gradient vectors
char *normfile; //stores averages and standard deviations
agf_command_opts opt_args;
real_a **train; //training vectors
cls_ta *cls; //training data classes
nel_ta ntrain; //number of training data points
nel_ta ntrain2; //revised training samples (after excluding -1 classes)
dim_ta nvar=0; //number of variables
nel_ta n1; //number of class labels
real_a **border; //border vectors
real_a **gradient; //gradient vectors
FILE *fs; //file stream
cls_ta nclass; //number of classes (anything above two is ignored
//though...)
nel_ta *clind; //indices of sorted classes
nel_ta clind2; //for the two-class classification
//(relative to start of non-excluded data)
real_a *all; //for deleting the training vectors
dim_ta nvar1, nvar2; //check number of variables against other files
real_a *b; //constant term for normalization (the average, basicly for reverse compatibility)
real_a *std, *ave; //average, standard deviation...
real_a vart;
cls_ta *part1; //for partitioning more than 2 classes
cls_ta *part2;
int exit_code;
int err_code;
//set defaults and parse command line options:
opt_args.k=-1;
opt_args.W2=WC_DEFAULT;
exit_code=0;
//AFG options: -v -V -W -k
//borders options: -s -t -r -T
//normalization options: -n -S -a
//supernewton iteration: -h -i -I
exit_code=agf_parse_command_opts(argc, argv, "a:h:i:I:k:l:r:s:t:T:v:V:W:nuS:", &opt_args);
if (exit_code==FATAL_COMMAND_OPTION_PARSE_ERROR) return exit_code;
if (argc < 2) {
printf("\n");
printf("syntax: class_borders [-n] [-k k] [-W Wc] [-s n] [-t tol] train border\n");
printf(" [cls1a cls1b cls1c ... %c cls2a cls2b cls2c ...]\n", PARTITION_SYMBOL);
printf("\n");
printf("arguments:\n");
printf(" train binary files containing locations of the samples:\n");
printf(" .vec for vectors;\n");
printf(" .cls for classes\n");
printf(" border base name of output files:\n");
printf(" .brd samples the border;\n");
printf(" .bgd contains gradient vectors\n");
printf(" .std contains normalization data (unless -a specified)\n");
printf(" clsIJ for partitioning multiple classes: Jth member of the Ith partition\n");
printf("\n");
printf("options:\n");
printf(" -a normfile file containing normalization data (input/output)\n");
printf(" -h maxit1 maximum number of iterations when searching for class border (%d)\n", (int32_t) agf_global_borders_maxiter);
printf(" -i maxit2 maximum number of iterations when calculating weights (%d)\n", (int32_t) agf_global_weights_maxiter);
printf(" -I maxit3 maximum number of iterations in supernewton (%d, %d)\n", (int32_t) agf_global_weights_maxiter, (int32_t) agf_global_borders_maxiter);
printf(" -k k number of nearest neighbours to use in each calculation\n");
printf(" --default is to use all the data\n");
printf(" -l tol tolerance of W (default=%g)\n", (float) agf_global_weights_tol);
printf(" -n option to normalise the data\n");
printf(" -r r0 location of discrimination border (default=0)\n");
printf(" -s n number of times to sample the border (default=%d)\n", (int32_t) opt_args.n);
printf(" -t tol tolerance of border samples (default=%g)\n", (float) opt_args.tol);
printf(" -T cthresh class threshold (default=%d)\n", 1);
printf(" -u store borders data in un-normalized coordinates\n");
printf(" -v var1 lower filter variance bracket\n");
printf(" --default is to use (the total variance of the data)/n^(2/D)\n");
printf(" -V var2 upper filter variance bracket\n");
printf(" --default is to use the total variance of the data\n");
printf(" -W Wc objective total weight (default=%g)\n", (float) opt_args.W2);
printf("\n");
return INSUFFICIENT_COMMAND_ARGS;
}
ave=NULL;
std=NULL;
b=NULL;
vecfile=new char[strlen(argv[0])+5];
sprintf(vecfile, "%s.vec", argv[0]);
//if we need a normalization file and one hasn't been named,
//construct the name:
if ((opt_args.svd>0 || opt_args.normflag) && opt_args.normfile == NULL) {
opt_args.normfile=new char[strlen(argv[1])+5];
sprintf(opt_args.normfile, "%s.std", argv[1]);
}
//get the training co-ordinate data, pre-process if necessary
train=agf_get_features(argv[0], &opt_args, nvar, ntrain);
all=train[0];
classfile=new char[strlen(argv[0])+5];
sprintf(classfile, "%s.cls", argv[0]);
cls=read_clsfile(classfile, n1);
if (n1 == -1) {
fprintf(stderr, "Error reading file: %s\n", classfile);
exit(FILE_READ_ERROR);
}
if (cls == NULL) {
fprintf(stderr, "Unable to open file, %s, for reading.\n", classfile);
exit(UNABLE_TO_OPEN_FILE_FOR_READING);
}
if (n1!=ntrain) {
fprintf(stderr, "Sample count mismatch: %d in %s, %d in %s.\n", ntrain, vecfile, n1, classfile);
exit(SAMPLE_COUNT_MISMATCH);
}
printf("%d %d-dimensional training vectors found: %s\n", ntrain, nvar, argv[0]);
//count the number of classes:
nclass=1;
for (nel_ta i=0; i<ntrain; i++) if (cls[i]>=nclass) nclass=cls[i]+1;
if (nclass < 2) {
fprintf(stderr, "class_borders: Cannot perform classifications with less than two classes!\n");
return PARAMETER_OUT_OF_RANGE;
}
if (opt_args.cl_thresh >= nclass) {
fprintf(stderr, "class_borders: Class threshold greater than number of classes.\n");
return PARAMETER_OUT_OF_RANGE;
}
//if there are partitions:
if (argc>2) {
cls_ta map[nclass];
cls_ta nncls=1; //new number of classes
err_code=parse_partition(argc-2, argv+2, nclass, map);
if (err_code!=0) {
fprintf(stderr, "class_borders: error parsing class partition\n");
exit(err_code);
}
apply_partition(cls, ntrain, map);
for (cls_ta i=0; i<nclass; i++) if (map[i]>=nncls) nncls=map[i]+1;
nclass=nncls;
}
brdfile=new char[strlen(argv[1])+5];
strcpy(brdfile, argv[1]);
strcat(brdfile, ".brd");
grdfile=new char[strlen(argv[1])+5];
strcpy(grdfile, argv[1]);
strcat(grdfile, ".bgd");
//dammit: recalculate the variances, even if they've just been calculated, above??
if (opt_args.var[0] <= 0 || opt_args.var[1] <= 0) {
//calculate the averages and standard deviations:
std=new real_a[nvar];
ave=new real_a[nvar];
calc_norm(train, nvar, ntrain, ave, std);
printf("Statistics:\n");
print_stats(stdout, ave, std, nvar);
printf("\n");
//if the initial filter variance is not set, set it to the total
//variance of the data:
vart=0;
for (dim_ta i=0; i<nvar; i++) vart+=std[i]*std[i];
if (opt_args.var[0] <= 0) {
opt_args.var[0]=vart/pow(ntrain, 2./nvar);
printf("Using %10.3g for lower filter variance bracket\n\n", opt_args.var[0]);
}
if (opt_args.var[1] <= 0) {
opt_args.var[1]=vart;
printf("Using %10.3g for upper filter variance bracket\n\n", opt_args.var[1]);
}
}
//sort the classes:
clind=sort_classes(train, ntrain, cls, nclass);
ntrain2=clind[nclass]-clind[0];
clind2=clind[opt_args.cl_thresh]-clind[0];
//check the range of k:
if (opt_args.k <= opt_args.W2 || opt_args.k >= ntrain2) {
if (opt_args.k != -1) {
fprintf(stderr, "class_borders: Parameter k=%d out of range. Using all the training data.\n", opt_args.k);
opt_args.k=-1;
exit_code=PARAMETER_OUT_OF_RANGE;
}
}
//allocate the arrays for holding the results:
border=allocate_matrix<real_a, nel_ta>(opt_args.n, nvar);
gradient=allocate_matrix<real_a, nel_ta>(opt_args.n, nvar);
//find class borders:
//if (ntrain < SMALL_DATASET) {
//should really be based on the ratio between possible combinations
//(in this case approximate) and the desired number:
//printf("cb: test=%f\n", (2.*opt_args.n)/(ntrain-clind[1])/clind[1]);
if ((2.*opt_args.n)/(ntrain2-clind2)/clind2 > 0.25) {
find_class_borders_small(train+clind[0], nvar, ntrain2, clind2, opt_args.n,
opt_args.var, opt_args.k, opt_args.W2, opt_args.tol,
border, gradient, opt_args.rthresh);
} else {
find_class_borders(train+clind[0], nvar, ntrain2, clind2, opt_args.n,
opt_args.var, opt_args.k, opt_args.W2, opt_args.tol,
border, gradient, opt_args.rthresh);
}
//un-normalize the vectors before writing them to a file:
//(only if -u set)
if (opt_args.uflag && opt_args.normfile!=NULL) {
real_a **bord2;
real_a **grad2;
real_a **mat;
gsl_matrix *mat2;
gsl_vector *xy;
gsl_vector *b;
gsl_vector *s;
gsl_matrix *vt;
gsl_vector *work;
//is this efficient enough?
mat=read_stats2(opt_args.normfile, ave, nvar1, nvar2);
//too lazy to do this properly at the moment:
if (mat==NULL || nvar1==-1 || nvar2==-1) {
fprintf(stderr, "class_borders: error reading normalization file, %s\n", opt_args.normfile);
exit(FILE_READ_ERROR);
}
if (nvar2!=nvar) {
fprintf(stderr, "class_borders: second dimension of tranformation matrix (%d) does not match that of features data (%d)\n", nvar2, nvar);
exit(DIMENSION_MISMATCH);
}
printf("Solving to convert borders data back to un-transformed coordinates...\n");
if (nvar<nvar1) fprintf(stderr, "class_borders: warning, inverse coord. transformation may be under-determined\n");
//xy=gsl_vector_alloc(nvar1);
//b=gsl_vector_alloc(nvar);
mat2=gsl_matrix_alloc(nvar1, nvar);
s=gsl_vector_alloc(nvar);
vt=gsl_matrix_alloc(nvar, nvar);
work=gsl_vector_alloc(nvar);
for (dim_ta i=0; i<nvar1; i++) {
for (dim_ta j=0; j<nvar; j++) {
gsl_matrix_set(mat2, i, j, mat[i][j]);
}
}
//use SVD to do the inversion
//since it will still work with transformation matrices
//that reduce the dimension--if the dimension reduction has been done
//effectively, the results might even be somewhat sensible
gsl_linalg_SV_decomp(mat2, vt, s, work);
//transpose decomposition:
gsl_vector_free(work);
bord2=allocate_matrix<real_a, nel_ta>(opt_args.n, nvar1);
grad2=allocate_matrix<real_a, nel_ta>(opt_args.n, nvar1);
for (nel_ta i=0; i<opt_args.n; i++) {
double tmp_b;
double tmp_g;
for (dim_ta j=0; j<nvar1; j++) {
bord2[i][j]=ave[j];
grad2[i][j]=0;
for (dim_ta k=0; k<nvar; k++) {
double s_k=gsl_vector_get(s, k);
if (s_k == 0) continue;
tmp_b=0;
tmp_g=0;
for (dim_ta l=0; l<nvar; l++) {
tmp_b+=gsl_matrix_get(vt, l, k)*border[i][l];
tmp_g+=gsl_matrix_get(vt, l, k)*gradient[i][l];
}
bord2[i][j]+=tmp_b*gsl_matrix_get(mat2, j, k)/s_k;
grad2[i][j]+=tmp_g*gsl_matrix_get(mat2, j, k)/s_k;
}
}
/*
for (dim_ta k=0; k<nvar; k++) {
gsl_vector_set(b, k, border[i][k]);
}
//invert the transformation matrix:
gsl_linalg_SV_solve(vt, mat2, s, b, xy);
for (dim_ta j=0; j<nvar1; j++) {
border[i][j]=gsl_vector_get(xy, j)+ave[j];
}
for (dim_ta k=0; k<nvar; k++) {
gsl_vector_set(b, k, gradient[i][k]);
}
gsl_linalg_SV_solve(vt, mat2, s, b, xy);
for (dim_ta j=0; j<nvar1; j++) {
gradient[i][j]=gsl_vector_get(xy, j);
}
*/
}
nvar=nvar1;
delete_matrix(border);
delete_matrix(gradient);
border=bord2;
gradient=grad2;
delete_matrix(mat);
gsl_matrix_free(mat2);
//gsl_vector_free(xy);
//gsl_vector_free(b);
gsl_vector_free(s);
gsl_matrix_free(vt);
}
//write them to a file:
fs=fopen(brdfile, "w");
fwrite(&nvar, sizeof(nvar), 1, fs);
fwrite(border[0], sizeof(real_a), nvar*opt_args.n, fs);
fclose(fs);
fs=fopen(grdfile, "w");
fwrite(&nvar, sizeof(nvar), 1, fs);
fwrite(gradient[0], sizeof(real_a), nvar*opt_args.n, fs);
fclose(fs);
//clean up:
//delete character strings containing file names:
delete [] vecfile;
delete [] classfile;
delete[] brdfile;
delete[] grdfile;
//delete integer and real_aing point arrays:
delete[] train;
delete[] all;
if (ave != NULL) delete[] ave;
if (std != NULL) delete[] std;
delete[] clind;
delete [] cls;
delete_matrix(border);
delete_matrix(gradient);
if (opt_args.normfile!=NULL) {
delete [] opt_args.normfile;
}
return exit_code;
}

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks