From: Aaron A. <aa...@us...> - 2007-12-13 01:31:19
|
Update of /cvsroot/jboost/jboost/scripts
In directory sc8-pr-cvs6.sourceforge.net:/tmp/cvs-serv18895

Modified Files:
    resample.py
Log Message:
This script is pretty hacky

Index: resample.py
===================================================================
RCS file: /cvsroot/jboost/jboost/scripts/resample.py,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -C2 -d -r1.1.1.1 -r1.2
*** resample.py 16 May 2007 04:06:02 -0000 1.1.1.1
--- resample.py 13 Dec 2007 01:31:14 -0000 1.2
***************
*** 11,15 ****
  def usage():
      print 'Usage: nfold.py --k=N --label=STRING --train=TRAINFILE'
!     print ' --k=N times to resample'
      print ' --train=TRAINFILE the training file to oversample'
      print ' --label=STRING the label to resample (note that this needs to be a unique identifier,'
--- 11,16 ----
  def usage():
      print 'Usage: nfold.py --k=N --label=STRING --train=TRAINFILE'
!     print ' --k=N integer, times to resample integer'
!     print ' --p=R number in [0,1] prob to keep nonlabel examples'
      print ' --train=TRAINFILE the training file to oversample'
      print ' --label=STRING the label to resample (note that this needs to be a unique identifier,'
***************
*** 20,47 ****
  # a further slight speed-up on my box
  # is to map a bound-method:
! def sort_dict(d):
      keys = d.keys()
      vals = d.keys()
!     keys.sort(lambda x,y : y-x)
      return [(key,d[key]) for key in keys]

! def generateFile(trainfile, oversample, labelstr):
!     # XXX: put in description
!
!     # load data and shuffle it
      f= file(trainfile, 'r')
      data= f.readlines()
      f.close()

      add_lines = {}
!
      i = 0
      for line in data:
          end = len(line)
!         if(line[end-4:end-1].rstrip() == ' 1;'):
              add_lines[i] = line
          i = i + 1

-
      add_lines = sort_dict(add_lines)
      for (key, val) in add_lines:
--- 21,56 ----
  # a further slight speed-up on my box
  # is to map a bound-method:
! def sort_dict(d, reverse=False):
      keys = d.keys()
      vals = d.keys()
!     if (reverse):
!         keys.sort(lambda x,y : y-x)
!     else:
!         keys.sort(lambda x,y : x-y)
      return [(key,d[key]) for key in keys]

! def generateFile(trainfile, oversample, undersample, labelstr):
      f= file(trainfile, 'r')
      data= f.readlines()
      f.close()
+

      add_lines = {}
!     num_positive_examples = 0
      i = 0
      for line in data:
          end = len(line)
!         if line[end-4:end-1].rstrip() == ' 1;':
              add_lines[i] = line
+             num_positive_examples += 1
+         elif line[end-4:end-1].rstrip() == '-1;':
+             # do nothing
+             x = 0
+         else:
+             print 'You clearly think this script was well written... you are mistaken...'
+             usage()
+             sys.exit(2)
          i = i + 1

      add_lines = sort_dict(add_lines)
      for (key, val) in add_lines:
***************
*** 49,56 ****
              data.insert(key,val)


      trainsuffix= '.train'
      oversample = oversample + 1
!     tfilename = trainfile + '.' + str(oversample) + trainsuffix
      tfile= file(tfilename, 'w')
      tfile.writelines(data)
--- 58,100 ----
              data.insert(key,val)

+
+     del_lines = {}
+     num_negative_examples = 0
+     i = 0
+     for line in data:
+         end = len(line)
+         if line[end-4:end-1].rstrip() == ' 1;':
+             # do nothing
+             x = 0
+         elif line[end-4:end-1].rstrip() == '-1;':
+             del_lines[i] = 'd'
+             num_negative_examples += 1
+         else:
+             print 'You clearly think this script was well written... you are mistaken...'
+             usage()
+             sys.exit(2)
+         i = i + 1
+
+     del_lines = sort_dict(del_lines, True)
+     random.seed(107)
+     num_lines_deleted = 0
+     for (key, val) in del_lines:
+         r = random.random()
+         if r > undersample:
+             num_lines_deleted += 1;
+             data.pop(key)
+
+     print 'Original number of examples:'
+     print '\tNegative examples:', num_negative_examples
+     print '\tPositive examples:', num_positive_examples
+     print 'After sampling:'
+     print '\tNegative examples:', num_negative_examples - num_lines_deleted
+     print '\tPositive examples:', num_positive_examples * (oversample + 1)
+     print 'Expected num lines deleted:', num_negative_examples * (1 - undersample)
+     print 'Actual num lines deleted:', num_lines_deleted

      trainsuffix= '.train'
      oversample = oversample + 1
!     tfilename = trainfile + '.' + str(oversample) + '.' + str(undersample) + trainsuffix
      tfile= file(tfilename, 'w')
      tfile.writelines(data)
***************
*** 60,70 ****

  def main():
-     # Usage:
-
-
-
      try:
          opts, args = getopt.getopt(sys.argv[1:],'' ,
!                                    ['k=','train=','label='])
      except getopt.GetoptError:
          print 'resample.py: Illegal argument\n'
--- 104,110 ----

  def main():
      try:
          opts, args = getopt.getopt(sys.argv[1:],'' ,
!                                    ['k=','train=','label=','p='])
      except getopt.GetoptError:
          print 'resample.py: Illegal argument\n'
***************
*** 73,82 ****

      # parse options
!     trainfile = k = label = None
      for opt,arg in opts:
          if (opt == '--train'):
              trainfile = arg
          elif (opt == '--k'):
!             k = float(arg)
          elif (opt == '--label'):
              label = arg
--- 113,124 ----

      # parse options
!     trainfile = k = p = label = None
      for opt,arg in opts:
          if (opt == '--train'):
              trainfile = arg
          elif (opt == '--k'):
!             k = int(arg)
!         elif (opt == '--p'):
!             p = float(arg)
          elif (opt == '--label'):
              label = arg
***************
*** 86,98 ****


!     if (trainfile==None or k==None):
          usage()
          sys.exit(2)

!     p = k - int(k)
      k = int(k) - 1
      print 'Oversample by', k
      print 'Prob sample', p
!     generateFile(trainfile, k, label)


--- 128,158 ----


!     if trainfile==None:
!         print 'Must specify file for resampling'
          usage()
          sys.exit(2)

!     if k==None and p==None:
!         print 'Must specify over/under sample quanitity'
!         usage()
!         sys.exit(2)
!
!     if not k==None and k < 2:
!         print 'k must be larger than 2 for anything to happen'
!         sys.exit(2)
!
!     if not p==None and p > 1:
!         print 'p must be less than 1 for anything to happen'
!         sys.exit(2)
!
!     if p==None:
!         p = 1
!     if k==None:
!         k = 1
!
      k = int(k) - 1
      print 'Oversample by', k
      print 'Prob sample', p
!     generateFile(trainfile, k, p, label)


|