From: william b. <wil...@us...> - 2009-01-17 03:16:41
|
Update of /cvsroot/jboost/jboost/scripts In directory fdv4jf1.ch3.sourceforge.com:/tmp/cvs-serv27885/scripts Modified Files: AddRandomIndex.py VisualizeScores.py Added Files: VisualizeScores.DEMO VisualizeScores.README AddRandomIndex.README Log Message: added commandline support, readme and demo files Index: VisualizeScores.py =================================================================== RCS file: /cvsroot/jboost/jboost/scripts/VisualizeScores.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** VisualizeScores.py 15 Sep 2008 22:24:45 -0000 1.7 --- VisualizeScores.py 17 Jan 2009 02:00:31 -0000 1.8 *************** *** 1,152 **** ! import sys,os,os.path,re,math,glob import jarray from java.util import Vector ! args = sys.argv[1:] ! #if len(args) != 1: ! # sys.exit(""" ! # Usage: VisualizeScore <info-files-path> ! # info-file-path is a directory containing files with names like trial0.test.boosting.info, trial2.train.boosting.info ... ! # Example: perform the following commands from the jboost root directory ! # <setup classpath to point to jython> ! # source scripts/setClassPath.sh ! # cd demo ! # cat spambase.test spambase.train > spambase.data ! # ../scripts/nfold.py --folds=3 --data=spambase.data --spec=spambase.spec --rounds=43 --tree=ADD_ALL --generate --booster=LogLossBoost ! # jython ../scripts/VisualizeScores.py cvdata-09-09-17-00-55/ADD_ALL/ ! # """) ! #info_path = args[0] ! ! header_p = re.compile('iteration=(\d+): elements=(\d+):') ! scoreline_p = re.compile('([-+\d.]+):\s+') #([+-1]+):') ! ! info_path = "/Users/yoavfreund/projects/jboost/demo/forVisualize/run1/" ! ! testfiles = glob.glob(info_path+"trial*.test.boosting.info") ! trainfiles = glob.glob(info_path+"trial*.train.boosting.info") ! ! #testfiles = glob.glob("/Users/yoavfreund/Downloads/dc_boost_active_output/iter?/*.test.boosting.info") ! ! #testfiles = glob.glob("/Users/yoavfreund/projects/jboost/demo/cvdata-09-09-17-00-55/ADD_ALL/trial*.test.boosting.info") ! ! 
print testfiles ! exampleData={} ! ! #find the iteration indices ! ! infile = open(testfiles[0],'r') ! iterList=[] ! iterDict={} ! iterationIndex=0 ! for line in infile: ! [(iter,elements)] = header_p.findall(line) ! elements = int(elements) ! ! if iterDict.has_key(iter): ! sys.exit("data file "+testfiles[0]+" has two lists corresponding to iteration "+iter) ! iterList.append(iter) ! iterDict[iter]=iterationIndex ! iterationIndex=iterationIndex+1 ! ! for count in range(elements): ! line=infile.readline() ! ! from jboost.visualization import DataSet,DataElement,HistogramFrame ! ! d=DataSet(iterList) ! d.setOutputFilename(info_path+"/selectedExamples.txt") ! ! # read the data into the Dataset data structures. ! for filename in testfiles: ! print filename ! infile = open(filename,'r') ! iterList=[]; ! for line in infile: ! [(iter,elements)] = header_p.findall(line) ! iterationIndex=iterDict[iter] ! elements = int(elements) ! ! for count in range(elements): ! line=infile.readline() ! #print line ! a = scoreline_p.findall(line) ! index=int(a[1]) ! score=float(a[3]) ! label=int(a[-1]) ! if not exampleData.has_key(index): ! exampleData[index]={"label":label, "scores":{} } ! exampleData[index]["scores"][int(iter)] = {"test":score, "train":[]} ! e=DataElement(score,index,label) ! d.addDataElement(e,iterationIndex) ! ! d.preProcessDataset() ! lowestScore = d.getMin() ! highestScore = d.getMax() ! print "lowestScore=%f, highestScore=%f\n" % (lowestScore,highestScore) ! print len(exampleData) ! for filename in trainfiles: ! print filename ! infile = open(filename,'r') for line in infile: ! [(iter,elements)] = header_p.findall(line) ! iterationIndex=iterDict[iter] elements = int(elements) ! for count in range(elements): ! line=infile.readline() ! #print line ! a = scoreline_p.findall(line) ! index=int(a[1]) ! score=float(a[3]) ! label=int(a[-1]) ! if exampleData.has_key(index): ! exampleData[index]["scores"][int(iter)]["train"].append(score) ! #else: ! 
#print "missing index=%d\n" % index ! ! ! iterList = exampleData[0]["scores"].keys() ! iterList.sort() ! ! fluctBins=20 ! percentage=0.05 ! barHeight=6 ! binWidth = (highestScore-lowestScore)/fluctBins ! fluct = {} ! for iter in iterList: ! fluct[iter] = [ {"trainScores":[]} for i in range(fluctBins)] ! ! for iterNo in range(len(iterList)): ! iter = iterList[iterNo] ! print "iteration %d\n" % iter ! for index in exampleData.keys(): ! testScore=exampleData[index]["scores"][iter]["test"] ! bin = int((testScore-lowestScore)/binWidth) ! bin = min(bin,fluctBins-1) ! for trainScore in exampleData[index]["scores"][iter]["train"]: ! fluct[iter][bin]["trainScores"].append(trainScore) ! ! y=0 ! for bin in range(fluctBins): ! fluct[iter][bin]["trainScores"].sort() ! n=len(fluct[iter][bin]["trainScores"]) ! if n>10: ! bottom = fluct[iter][bin]["trainScores"][int(n*percentage)] ! top = fluct[iter][bin]["trainScores"][int(n*(1-percentage))] ! yMax = y ! yMin = y-barHeight ! y=y-1.1*barHeight ! fluct[iter][bin]["bottom"] = bottom ! fluct[iter][bin]["top"]=top ! fluct[iter][bin]["yMin"]=yMin ! fluct[iter][bin]["yMax"]=yMax ! ! print "[%f,%f] -> [%f,%f] X [%f,%f]\n" % (lowestScore+bin*binWidth,lowestScore+(bin+1)*binWidth,bottom,top,yMin,yMax) ! d.addFluctItems(iterNo, lowestScore+bin*binWidth, lowestScore+(bin+1)*binWidth, bottom, top, yMin, yMax) ! fluct[iter][bin]["trainScores"] = [] ! ! v=HistogramFrame(d) ! v.show() --- 1,163 ---- ! import sys, os, os.path, re, math, glob import jarray from java.util import Vector ! def usage(): ! print("Usage: VisualizeScores <info-files-path>") ! print("info-file-path is a directory containing files with names like trial0.test.boosting.info, trial2.train.boosting.info ...") ! print("Example: perform the following commands from the jboost root directory") ! print("<setup classpath to point to jython>") ! print("source scripts/setClassPath.sh") ! print("cd demo") ! print("cat spambase.test spambase.train > spambase.data") ! 
print("../scripts/nfold.py --folds=3 --data=spambase.data --spec=spambase.spec --rounds=43 --tree=ADD_ALL --generate --booster=LogLossBoost") ! print("jython ../scripts/VisualizeScores.py cvdata-09-09-17-00-55/ADD_ALL/trial*") ! print("") ! print("IMPORTANT NOTE: jboost should of been run on data and spec files passed through AddRandomIndex.py") ! def main(): ! args = sys.argv[1:] ! ! globpath = None ! if len(args) == 1: ! globpath = args[0] ! else: ! usage() ! sys.exit(2) ! header_p = re.compile('iteration=(\d+): elements=(\d+):') ! scoreline_p = re.compile('([-+\d.]+):\s+') #([+-1]+):') ! ! ! testfiles = glob.glob(globpath + "*.test.boosting.info") ! trainfiles = glob.glob(globpath + "*.train.boosting.info") ! ! #testfiles = glob.glob("/Users/yoavfreund/Downloads/dc_boost_active_output/iter?/*.test.boosting.info") ! ! #testfiles = glob.glob("/Users/yoavfreund/projects/jboost/demo/cvdata-09-09-17-00-55/ADD_ALL/trial*.test.boosting.info") ! ! print(testfiles) ! exampleData = {} ! ! #find the iteration indices ! ! infile = open(testfiles[0], 'r') ! iterList = [] ! iterDict = {} ! iterationIndex = 0 for line in infile: ! [(iter, elements)] = header_p.findall(line) elements = int(elements) ! ! if iterDict.has_key(iter): ! sys.exit("data file " + testfiles[0] + " has two lists corresponding to iteration " + iter) ! iterList.append(iter) ! iterDict[iter] = iterationIndex ! iterationIndex = iterationIndex + 1 ! for count in range(elements): ! line = infile.readline() ! ! from jboost.visualization import DataSet, DataElement, HistogramFrame ! ! d = DataSet(iterList) ! d.setOutputFilename(globpath + "/selectedExamples.txt") ! ! # read the data into the Dataset data structures. ! for filename in testfiles: ! print(filename) ! infile = open(filename, 'r') ! iterList = []; ! for line in infile: ! [(iter, elements)] = header_p.findall(line) ! iterationIndex = iterDict[iter] ! elements = int(elements) ! ! for count in range(elements): ! line = infile.readline() ! #print line ! 
a = scoreline_p.findall(line) ! index = int(a[1]) ! score = float(a[3]) ! label = int(a[ - 1]) ! if not exampleData.has_key(index): ! exampleData[index] = {"label":label, "scores":{} } ! exampleData[index]["scores"][int(iter)] = {"test":score, "train":[]} ! e = DataElement(score, index, label) ! d.addDataElement(e, iterationIndex) ! ! d.preProcessDataset() ! lowestScore = d.getMin() ! highestScore = d.getMax() ! print("lowestScore=%f, highestScore=%f\n" % (lowestScore, highestScore)) ! ! print(len(exampleData)) ! ! for filename in trainfiles: ! print(filename) ! infile = open(filename, 'r') ! for line in infile: ! [(iter, elements)] = header_p.findall(line) ! iterationIndex = iterDict[iter] ! elements = int(elements) ! ! for count in range(elements): ! line = infile.readline() ! #print line ! a = scoreline_p.findall(line) ! index = int(a[1]) ! score = float(a[3]) ! label = int(a[ - 1]) ! if exampleData.has_key(index): ! exampleData[index]["scores"][int(iter)]["train"].append(score) ! #else: ! #print "missing index=%d\n" % index ! ! ! # changed first index below from 0 to exampleData.keys()[0] : by boyko ! iterList = exampleData[exampleData.keys()[0]]["scores"].keys() ! iterList.sort() ! ! fluctBins = 20 ! percentage = 0.05 ! barHeight = 6 ! binWidth = (highestScore - lowestScore) / fluctBins ! fluct = {} ! for iter in iterList: ! fluct[iter] = [ {"trainScores":[]} for i in range(fluctBins)] ! ! for iterNo in range(len(iterList)): ! iter = iterList[iterNo] ! print("iteration %d\n" % iter) ! for index in exampleData.keys(): ! testScore = exampleData[index]["scores"][iter]["test"] ! bin = int((testScore - lowestScore) / binWidth) ! bin = min(bin, fluctBins - 1) ! for trainScore in exampleData[index]["scores"][iter]["train"]: ! fluct[iter][bin]["trainScores"].append(trainScore) ! ! y = 0 ! for bin in range(fluctBins): ! fluct[iter][bin]["trainScores"].sort() ! n = len(fluct[iter][bin]["trainScores"]) ! if n > 10: ! 
bottom = fluct[iter][bin]["trainScores"][int(n * percentage)] ! top = fluct[iter][bin]["trainScores"][int(n * (1 - percentage))] ! yMax = y ! yMin = y - barHeight ! y = y - 1.1 * barHeight ! fluct[iter][bin]["bottom"] = bottom ! fluct[iter][bin]["top"] = top ! fluct[iter][bin]["yMin"] = yMin ! fluct[iter][bin]["yMax"] = yMax ! ! print("[%f,%f] -> [%f,%f] X [%f,%f]\n" % (lowestScore + bin * binWidth, lowestScore + (bin + 1) * binWidth, bottom, top, yMin, yMax)) ! d.addFluctItems(iterNo, lowestScore + bin * binWidth, lowestScore + (bin + 1) * binWidth, bottom, top, yMin, yMax) ! fluct[iter][bin]["trainScores"] = [] ! ! ! v = HistogramFrame(d) ! v.show() ! if __name__ == "__main__": ! main() --- NEW FILE: VisualizeScores.DEMO --- Score visualization demo using spambase demo data and nfold validation: Assumptions ========== 0. you have java, jython, and python installed. 1. you have downloaded the jboost dist from sourceforge.net and built jboost from source. You CANNOT use the prebuilt jboost.jar (version 1.4 or earlier) as we are using a yet-unreleased version of jboost. 2. You setup your CLASSPATH and JBOOST_DIR environment variables per the install instructions on http://jboost.sourceforge.net/install.html 3. You are running on linux or mac. Instructions are for BASH shell. Mod as appropriate for your environment. The Demo steps: we are going to do nFold validation on the spambase demo data found in $JOOST_DIR/demo. ============== 1. add some libs to your (existing) java CLASSPATH: > export CLASSPATH=$CLASSPATH:$JBOOST_DIR/lib/jcommon-1.0.8.jar:$JBOOST_DIR/lib/jfreechart-1.0.10.jar:$JBOOST_DIR/lib/swing-layout-1.0.jar 2. cd to the demo directory > cd $JBOOST_DIR/demo 3. add an index number to each row of example data and shuffle the examples. We have a python script to do this for you. > ls spam* spambase.data spambase.test spambase.spec spambase.train > ../scripts/AddRandomIndex.py spambase wrote spambase_idx.data. wrote spambase_idx.spec. 
> ls spam* spambase.data spambase.test spambase_idx.data spambase.spec spambase.train spambase_idx.spec 4. Now we are going to perform Nfold crossvalidation using these new spambase_idx.* files. You can look at the nfold.py script to decipher the command line args. > ../scripts/nfold.py --booster=LogLossBoost --folds=2 --data=spambase_idx.data --spec=spambase_idx.spec --rounds=50 --tree=ADD_ALL --generate k: 0 start:0 end:2300 k: 1 start:2300 end:4600 *=---------------------------------------------------------------------=-* * Fold 0 | *============ java -Xmx1000M -cp :/Users/wbeaver/Documents/workspace/jboost/dist/jboost.jar:/Users/wbeaver/Documents/workspace/jboost/lib/concurrent.jar:/Users/wbeaver/Documents/workspace/jboost/lib/jcommon-1.0.8.jar:/Users/wbeaver/Documents/workspace/jboost/lib/jfreechart-1.0.10.jar:/Users/wbeaver/Documents/workspace/jboost/lib/swing-layout-1.0.jar jboost.controller.Controller -b LogLossBoost -p 3 -a -1 -S trial0 -n trial.spec -ATreeType ADD_ALL -numRounds 50 Fileloader adding . to path. WARNING: configuration file jboost.config not found. Continuing... Found trial.spec Found trial0.train Found trial.spec Found trial0.test Booster type: jboost.booster.LogLossBoost Read 100 training examples Read 200 training examples Read 300 training examples Read 400 training examples Read 500 training examples Read 600 training examples [snip] you get the idea. 5. the results are placed in the directory ./cvdata-mm-dd-hh-mm-ss/<TREE-TYPE> 6. Now run the visualizer in the scripts directory. 
Assuming jython is not on your path, be explicit: > ~/jython2.2.1/jython ../scripts/VisualizeScores.py cvdata-mm-dd-hh-mm-ss/ADD_ALL/trial (note, this example output shows the cvdata-mm-dd-hh-mm-ss for my test run) ['cvdata-01-16-17-44-23/ADD_ALL/trial0.test.boosting.info', 'cvdata-01-16-17-44-23/ADD_ALL/trial1.test.boosting.info'] cvdata-01-16-17-44-23/ADD_ALL/trial0.test.boosting.info cvdata-01-16-17-44-23/ADD_ALL/trial1.test.boosting.info index=0, a.length=4600 index=1, a.length=4600 index=2, a.length=4600 index=3, a.length=4600 index=4, a.length=4600 index=5, a.length=4600 index=6, a.length=4600 [snip] Assuming everything is set up, you will then see the GUI. 7. Select a range of examples to output for a given iteration using the slider and press the "Print Example Indices" button in the lower left corner. This will save a file called "SelectedExamples.txt" into the directory you ran the GUI from (in this case $JBOOST_DIR/demo). Using this file, you can write a parser to drill back to your original examples. [end] --- NEW FILE: AddRandomIndex.README --- AddRandomIndex.py ================= Adds a random index [0..number of examples-1] as the first column of a jboost .data file and saves a copy of the .data file suffixed with "_idx" (for example: test.data becomes test_idx.data). Also adds a new column called "INDEX" with type "number" at feature position zero of the spec file. Requirements ============ * Jython or Python * boosting .data and .spec files Usage ===== Invoke the script: [user@host scripts] $ python AddRandomIndex.py <path to data/spec file stem> where <path to data/spec file stem> is the full path to the stem of your data and spec files (both files are assumed to be located in the same directory). Example ======= [user@host scripts] $ ls /Users/jsmith/boostinginfo/ mydata.data mydata.spec [user@host scripts] $ python AddRandomIndex.py /Users/jsmith/boostinginfo/mydata wrote /Users/jsmith/boostinginfo/mydata_idx.data. wrote /Users/jsmith/boostinginfo/mydata_idx.spec. 
[user@host scripts] $ ls /Users/jsmith/boostinginfo/ mydata.data mydata_idx.data mydata.spec mydata_idx.spec Use these new *_idx.data and *_idx.spec file as your training data for jboost (via jboost directly or as data sources for nfold.py script). Index: AddRandomIndex.py =================================================================== RCS file: /cvsroot/jboost/jboost/scripts/AddRandomIndex.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** AddRandomIndex.py 11 Sep 2008 19:58:00 -0000 1.1 --- AddRandomIndex.py 17 Jan 2009 02:00:31 -0000 1.2 *************** *** 1,2 **** --- 1,15 ---- + #!/usr/bin/python + + import sys + import os.path + + def usage(): + print("Usage: AddRandomIndex.py <path to data and spec file stem name>") + print("Example: path to data and spec files are:") + print("\t/home/jsmith/myfiles/myinfo.data and /home/jsmith/myfiles/myinfo.spec") + print("Usage: AddRandomIndex.py /home/jsmith/myfiles/myinfo") + + + """ add an INDEX field to a jboost data file. INDEX is a randomly permuted number ranging *************** *** 5,34 **** an n-fold cross validation experiment. """ ! ! filename = "/Users/yoavfreund/projects/jboost/demo/spambase" ! ! datafile = open(filename+".data",'r') ! ! lines=[] ! morelines = datafile.readlines(100000) ! while len(morelines)>0: ! lines.extend(morelines) morelines = datafile.readlines(100000) ! datafile.close() ! ! length = len(lines) ! ! from random import shuffle ! shuffle(lines) ! ! newdatafile = open(filename+"I.data",'w') ! for i in range(length): ! newdatafile.write(("%d," % i)+lines[i]) ! newdatafile.close() ! --- 18,92 ---- an n-fold cross validation experiment. """ ! def main(): ! ! args = sys.argv[1:] ! abort = False ! stem_path = "" ! if len(args) == 1: ! stem_path = args[0] ! else: ! abort = True ! ! datafilepath = stem_path+".data" ! specfilepath = stem_path+".spec" ! ! if not os.path.isfile(datafilepath): ! abort = True ! print("cannot find " + datafilepath) ! ! 
if not os.path.isfile(specfilepath): ! abort = True ! print("cannot find " + specfilepath) ! ! if abort: ! usage() ! sys.exit(2) ! ! datafile = open(datafilepath,'r') ! ! lines=[] morelines = datafile.readlines(100000) ! while len(morelines)>0: ! lines.extend(morelines) ! morelines = datafile.readlines(100000) ! datafile.close() ! length = len(lines) ! ! from random import shuffle ! shuffle(lines) + newdatafilepath = stem_path+"_idx.data" + newdatafile = open(newdatafilepath,'w') + for i in range(length): + newdatafile.write(("%d," % i)+lines[i]) + + newdatafile.close() + print("wrote " + newdatafilepath + ".") + + # add INDEX feature to top of .spec File + spec_file = open(specfilepath,'r') + features = spec_file.readlines() + spec_file.close() + + i=0 + for idx,line in enumerate(features): + if line.startswith(('\n','exampleTerminator','attributeTerminator','maxBadExa','maxBadAtt')): + i=i+1 + else: + break + + + features.insert(i, "INDEX number\n") + + newspecfilepath = stem_path+"_idx.spec" + newspecfile = open(newspecfilepath,'w') + newspecfile.write("".join(features)) + newspecfile.close() + print("wrote " + newspecfilepath + ".") + if __name__ == "__main__": + main() --- NEW FILE: VisualizeScores.README --- VisualizeScores.py Visualize margin score distributions from training and test folds. Requirements ============ * Jython * build of jboost from current cvs src tree (NOT THE PREBUILT JBOOST.JAR VERSION 1.4) * test | train.boosting.info files which are indexed (see AddRandomIndex.py) * binary class labels (i.e. no multilabel support at this time) TERMS: $JBOOST_DIR is environment variable pointing to the root of your downloaded jboost dist. Usage ===== You must add the files from $JBOOST_DIR/lib that are not on your CLASSPATH to your CLASSPATH. 
If you followed the install instructions from the JBoost website, this would mean adding $JBOOST_DIR/lib/jcommon-1.0.8.jar, $JBOOST_DIR/lib/jfreechart-1.0.10.jar, and $JBOOST_DIR/lib/swing-layout-1.0.jar to the existing classpath (see example below). Once the classpath is set, invoke the script with Jython from the command line: [user@host jython2.2.1]$ ./jython $JBOOST_DIR/scripts/VisualizeScores.py <path to *.boosting.info files> Where <path to *.boosting.info files> is the full path to the stem of the .boosting.info files you would like to process. For example: /Users/jsmith/boostingdata/cvdata/ADD_ALL/trial will glob trial0.train.boosting.info, trial0.test.boosting.info, trial1.train.boosting.info, etc. Example ======= (this example is run from within Jython dist): [user@host jython2.2.1]$ export JBOOST_DIR=/Users/jsmith/workspace/jboost/ (export for a bash shell. ignore if already done) [user@host jython2.2.1]$ export CLASSPATH=$CLASSPATH:$JBOOST_DIR/lib/jcommon-1.0.8.jar:$JBOOST_DIR/lib/jfreechart-1.0.10.jar:$JBOOST_DIR/lib/swing-layout-1.0.jar (NOTE: ASSUMES you have already added jboost.jar and concurrent.jar to your classpath) [user@host jython2.2.1]$ ./jython $JBOOST_DIR/scripts/VisualizeScores.py /Users/jsmith/boostingdata/cvdata/ADD_ALL/trial After loading, parsing, and analyzing the boosting.info files, the GUI will launch. The boosting iteration is shown in the upper right. A histogram showing example count by boosting score for each label is predominant and in the center of the window. Sliders below the histogram allow you to select lower and upper score ranges. The region within this range on the ROC curve is shown in the upper left of the window. Pressing the button in the lower left (save selected examples) will write to a file the example index and boosting score for each example within the selected range. This file is saved to the directory indicated in step 1 above. [end] |