From: Yoav F. <yf...@us...> - 2008-09-11 04:14:45
|
Update of /cvsroot/jboost/jboost/scripts In directory sc8-pr-cvs17.sourceforge.net:/tmp/cvs-serv24863/scripts Modified Files: VisualizeScores.py Log Message: First version of visualizeScores.py that computes fluctuations in scores between folds. Index: VisualizeScores.py =================================================================== RCS file: /cvsroot/jboost/jboost/scripts/VisualizeScores.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** VisualizeScores.py 10 Sep 2008 00:19:25 -0000 1.4 --- VisualizeScores.py 11 Sep 2008 04:14:40 -0000 1.5 *************** *** 4,33 **** args = sys.argv[1:] ! if len(args) != 1: ! sys.exit(""" ! Usage: VisualizeScore <info-files-path> ! info-file-path is a directory containing files with names like trial0.test.boosting.info, trial2.train.boosting.info ... ! Example: perform the following commands from the jboost root directory ! <setup classpath to point to jython> ! source scripts/setClassPath.sh ! cd demo ! cat spambase.test spambase.train > spambase.data ! ../scripts/nfold.py --folds=3 --data=spambase.data --spec=spambase.spec --rounds=43 --tree=ADD_ALL --generate --booster=LogLossBoost ! jython ../scripts/VisualizeScores.py cvdata-09-09-17-00-55/ADD_ALL/ ! """) ! info_path = args[0] header_p = re.compile('iteration=(\d+): elements=(\d+):') scoreline_p = re.compile('([-+\d.]+):\s+') #([+-1]+):') testfiles = glob.glob(info_path+"trial*.test.boosting.info") #testfiles = glob.glob("/Users/yoavfreund/Downloads/dc_boost_active_output/iter?/*.test.boosting.info") #testfiles = glob.glob("/Users/yoavfreund/projects/jboost/demo/cvdata-09-09-17-00-55/ADD_ALL/trial*.test.boosting.info") print testfiles ! indexes={} ! scores={} ! labels={} #find the iteration indices --- 4,35 ---- args = sys.argv[1:] ! #if len(args) != 1: ! # sys.exit(""" ! # Usage: VisualizeScore <info-files-path> ! # info-file-path is a directory containing files with names like trial0.test.boosting.info, trial2.train.boosting.info ... ! # Example: perform the following commands from the jboost root directory ! # <setup classpath to point to jython> ! # source scripts/setClassPath.sh ! # cd demo ! # cat spambase.test spambase.train > spambase.data ! # ../scripts/nfold.py --folds=3 --data=spambase.data --spec=spambase.spec --rounds=43 --tree=ADD_ALL --generate --booster=LogLossBoost ! # jython ../scripts/VisualizeScores.py cvdata-09-09-17-00-55/ADD_ALL/ ! # """) ! #info_path = args[0] header_p = re.compile('iteration=(\d+): elements=(\d+):') scoreline_p = re.compile('([-+\d.]+):\s+') #([+-1]+):') + info_path = "/Users/yoavfreund/projects/jboost/demo/cvdata-09-10-13-54-55/ADD_ALL/" + testfiles = glob.glob(info_path+"trial*.test.boosting.info") + trainfiles = glob.glob(info_path+"trial*.train.boosting.info") #testfiles = glob.glob("/Users/yoavfreund/Downloads/dc_boost_active_output/iter?/*.test.boosting.info") #testfiles = glob.glob("/Users/yoavfreund/projects/jboost/demo/cvdata-09-09-17-00-55/ADD_ALL/trial*.test.boosting.info") + print testfiles ! exampleData={} #find the iteration indices *************** *** 56,68 **** # read the data into the Dataset data structures. for filename in testfiles: - print filename infile = open(filename,'r') iterList=[]; for line in infile: - print line [(iter,elements)] = header_p.findall(line) iterationIndex=iterDict[iter] elements = int(elements) - print "iter=%s iterationIndex=%d elements=%d\n" % (iter,iterationIndex, elements) for count in range(elements): --- 58,67 ---- *************** *** 70,86 **** #print line a = scoreline_p.findall(line) ! index=int(a[0]) ! score=float(a[2]) label=int(a[-1]) e=DataElement(score,index,label) d.addDataElement(e,iterationIndex) - print "going to preprocess\n" d.preProcessDataset() ! print "finisehd preprocessing\n" ! print "min= %f, max=%f\n" % (d.getMin(),d.getMax()) ! v=HistogramFrame(d) - v.show() --- 69,157 ---- #print line a = scoreline_p.findall(line) ! index=int(a[1]) ! score=float(a[3]) label=int(a[-1]) + if not exampleData.has_key(index): + exampleData[index]={"label":label, "scores":{} } + exampleData[index]["scores"][int(iter)] = {"test":score, "train":[]} e=DataElement(score,index,label) d.addDataElement(e,iterationIndex) d.preProcessDataset() ! lowestScore = d.getMin() ! highestScore = d.getMax() ! print "lowestScore=%f, highestScore=%f\n" % (lowestScore,highestScore) ! v=HistogramFrame(d) ! v.show() ! print len(exampleData) ! for filename in trainfiles: ! print filename ! infile = open(filename,'r') ! for line in infile: ! [(iter,elements)] = header_p.findall(line) ! iterationIndex=iterDict[iter] ! elements = int(elements) ! ! for count in range(elements): ! line=infile.readline() ! #print line ! a = scoreline_p.findall(line) ! index=int(a[1]) ! score=float(a[3]) ! label=int(a[-1]) ! if exampleData.has_key(index): ! exampleData[index]["scores"][int(iter)]["train"].append(score) ! #else: ! #print "missing index=%d\n" % index ! ! ! iterList = exampleData[0]["scores"].keys() ! iterList.sort() ! ! fluctBins=10 ! fluct = {} ! for iter in iterList: ! fluct[iter] = [{"sum":0.0, "sumSquare":0.0, "count":0} for i in range(fluctBins)] ! ! for index in exampleData.keys(): ! for iter in iterList: ! testScore=exampleData[index]["scores"][iter]["test"] ! maxDiff=0 ! for trainScore in exampleData[index]["scores"][iter]["train"]: ! maxDiff = max(maxDiff,abs(trainScore-testScore)) ! exampleData[index]["scores"][iter]["maxDiff"]=maxDiff ! bin = int((testScore-lowestScore)/(highestScore-lowestScore)*fluctBins) ! bin = min(bin,fluctBins-1) ! fluct[iter][bin]["count"] += 1 ! fluct[iter][bin]["sum"] += maxDiff ! fluct[iter][bin]["sumSquare"] += maxDiff*maxDiff ! ! from math import sqrt ! ! step = (highestScore-lowestScore)/fluctBins ! for iter in iterList: ! print "iteration %d\n" % iter ! for bin in range(fluctBins): ! count = fluct[iter][bin]["count"] ! if count>0: ! mean = fluct[iter][bin]["sum"] / count ! meanSquare = fluct[iter][bin]["sumSquare"] / count ! std = sqrt(meanSquare-mean*mean) ! else: ! mean=0 ! std=0 ! fluct[iter][bin]["mean"]=mean ! fluct[iter][bin]["std"]=std ! fl=fluct[iter][bin] ! r=lowestScore + bin*step ! print "\t[%f,%f] count=%d, mean=%f, std=%f\n" % (r,r+step,count,mean,std) ! ! #for index in range(5): ! # print "%d: label=%d\n" % (index,exampleData[index]["label"]) ! # for iterNo in iterList: ! # print ("%d: " % iterNo) ! # print exampleData[index]["scores"][iterNo] ! |