[marf-cvs] marf/src/marf/Classification/Stochastic Stochastic.java, 1.25, 1.25.4.1 ZipfLaw.java, 1.30, 1.30.4.1
Brought to you by:
mokhov
From: Serguei A. M. <mo...@us...> - 2008-04-09 00:32:46
|
Update of /cvsroot/marf/marf/src/marf/Classification/Stochastic In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv9497/src/marf/Classification/Stochastic Modified Files: Tag: DISTRIBUTED_MARF_0_3_0_INTEGRATION Stochastic.java ZipfLaw.java Log Message: Consolidate most of the differences between the MAIN and Distributed MARF branches. This primarily includes the copyright year update to 2008, SpeakerIdentApp with the new options of loaders and the BandStopFilter, ZipfLaw, CosineSimilarityMeasure, and HammingDistance. Include stats per config along with stats per subject. Add timing measurements. A lot of comment spell checks and other corrections. Generalize some of the code, fix naming conventions in places. Fix some bugs that popped up in the main branch down to DMARF. Add some new implementation developments, such as nlp.Storage and Classification.Similarity. Refactor Configuration and some other classes. Make WAVLoader more customizable, in particular being able to load 16000 Hz samples. Many other fixes that will be summarized in the MAIN's ChangeLog. Index: Stochastic.java =================================================================== RCS file: /cvsroot/marf/marf/src/marf/Classification/Stochastic/Stochastic.java,v retrieving revision 1.25 retrieving revision 1.25.4.1 diff -C2 -d -r1.25 -r1.25.4.1 *** Stochastic.java 31 Jul 2006 02:19:13 -0000 1.25 --- Stochastic.java 9 Apr 2008 00:32:11 -0000 1.25.4.1 *************** *** 12,16 **** * <p>TODO: partially implemented.</p> * ! * <p>$Id$</p> * * @author Serguei Mokhov --- 12,16 ---- * <p>TODO: partially implemented.</p> * ! 
* $Id$ * * @author Serguei Mokhov Index: ZipfLaw.java =================================================================== RCS file: /cvsroot/marf/marf/src/marf/Classification/Stochastic/ZipfLaw.java,v retrieving revision 1.30 retrieving revision 1.30.4.1 diff -C2 -d -r1.30 -r1.30.4.1 *** ZipfLaw.java 3 Sep 2006 20:56:01 -0000 1.30 --- ZipfLaw.java 9 Apr 2008 00:32:12 -0000 1.30.4.1 *************** *** 8,13 **** --- 8,16 ---- import marf.Classification.ClassificationException; + import marf.Classification.Distance.DiffDistance; import marf.FeatureExtraction.IFeatureExtraction; + import marf.Stats.StatisticalObject; import marf.Stats.WordStats; + import marf.Storage.Result; import marf.Storage.StorageException; import marf.util.SortComparator; *************** *** 31,35 **** * @since 0.3.0.5 */ ! public static final int OUTPUT_PAGE_SIZE = 100; /** --- 34,38 ---- * @since 0.3.0.5 */ ! public static final int DEFAULT_OUTPUT_PAGE_SIZE = 100; /** *************** *** 40,45 **** /** * Sorted references to stats. */ ! private WordStats[] aoSortedStatRefs = null; /** --- 43,51 ---- /** * Sorted references to stats. + * As of 0.3.0.6 was set to the base type StatisticalObject + * instead of WordStats to allow other than word elements. */ ! // private WordStats[] aoSortedStatRefs = null; ! private StatisticalObject[] aoSortedStatRefs = null; /** *************** *** 61,64 **** --- 67,77 ---- /** + * When the results are dumped in the text mode, tell how + * many records to show per page. + * @since 0.3.0.6 + */ + private int iOutputPageSize = DEFAULT_OUTPUT_PAGE_SIZE; + + /** * For serialization versioning. * When adding new members or make other structural *************** *** 84,92 **** /** * Classification API. ! * @param poFeatureExtraction preprcessing module to get the data from */ public ZipfLaw(IFeatureExtraction poFeatureExtraction) { super(poFeatureExtraction); } --- 97,263 ---- /** * Classification API. ! 
* @param poFeatureExtraction preprocessing module to get the data from */ public ZipfLaw(IFeatureExtraction poFeatureExtraction) { super(poFeatureExtraction); + this.strFilename = getTrainingSetFilename().replaceAll("marf.Storage.TrainingSet", getClass().getName()); + this.oStats = new Hashtable(); + this.oObjectToSerialize = this; + } + + /** + * @since 0.3.0.6 + * @see marf.Classification.IClassification#classify(double[]) + */ + public boolean classify(double[] padFeatureVector) + throws ClassificationException + { + try + { + // Unseen data + collectStatistics(padFeatureVector); + + // Back up its stats + Hashtable oBackupStats = (Hashtable)this.oStats.clone(); + StatisticalObject[] aoSortedDataBackup = (StatisticalObject[])this.aoSortedStatRefs.clone(); + + // Restore data from the training set + restore(); + + double[] adUnseenData = new double[aoSortedDataBackup.length]; + double[] adSeenData = new double[this.aoSortedStatRefs.length]; + + // The unseen and trained vectors have to have identical + // observations on the LHS. The ones that are missing on + // either one get a frequency of zero. This is required + // for meaningful component-wise distance comparison + // afterwards as raw percentages will not do that. 
+ // This can be approximated with the DiffDistance classifier + // as a temporary workaround + // XXX + + // Compute totals prior conversion to percentages + int iUnseenObservationsTotal = 0; + int iTrainingObservationsTotal = 0; + + for(int i = 0; i < aoSortedDataBackup.length; i++) + { + iUnseenObservationsTotal += aoSortedDataBackup[i].getFrequency(); + } + + for(int i = 0; i < this.aoSortedStatRefs.length; i++) + { + iTrainingObservationsTotal += this.aoSortedStatRefs[i].getFrequency(); + } + + // Convert unseen and stored data to just plain double[] arrays + for(int i = 0; i < aoSortedDataBackup.length; i++) + { + adUnseenData[i] = (double)aoSortedDataBackup[i].getFrequency() / iUnseenObservationsTotal; + } + + for(int i = 0; i < this.aoSortedStatRefs.length; i++) + { + adSeenData[i] = (double)this.aoSortedStatRefs[i].getFrequency() / iTrainingObservationsTotal; + } + + // Compare the unseen and stored data using a specified + // Distance classifier + // XXX + //DiffDistance oDistance = ClassifcationFactory.create(this.iDistanceMethod); + DiffDistance oDistance = new DiffDistance(null); + double dDistance = oDistance.distance(adSeenData, adUnseenData); + this.oResultSet.addResult(1, dDistance); + + return true; + } + catch(ClassificationException e) + { + throw e; + } + catch(Exception e) + { + throw new ClassificationException(e); + } + } + + /** + * @since 0.3.0.6 + * @see marf.Classification.IClassification#train(double[]) + */ + public boolean train(double[] padFeatureVector) + throws ClassificationException + { + try + { + restore(); + collectStatistics(padFeatureVector); + dump(); + + return true; + } + catch(ClassificationException e) + { + throw e; + } + catch(Exception e) + { + throw new ClassificationException(e); + } + } + + /** + * @since 0.3.0.6 + * @see marf.Classification.IClassification#getResult() + */ + public Result getResult() + { + // TODO Auto-generated method stub + return super.getResult(); + } + + /** + * Collects result statistics. 
+ * TODO: employ StatsCollector. + * @param padFeatures desired stream tokenizer + * @throws ClassificationException in case of inner exceptions + */ + public final void collectStatistics(double[] padFeatures) + throws ClassificationException + { + try + { + // All items are one unit in length + this.iMinWordLength = this.iMaxWordLength = 1; + + // Collect Stats + for(int i = 0; i < padFeatures.length; i++) + { + // Look up if we already have an entry for this. + StatisticalObject oFeatureStats = (StatisticalObject)this.oStats.get(new Double(padFeatures[i])); + + // New entry + if(oFeatureStats == null) + { + oFeatureStats = new StatisticalObject(1); + this.oStats.put(new Double(padFeatures[i]), oFeatureStats); + } + + // Update existing entry + else + { + oFeatureStats.incFrequency(); + } + } + + // Sort and assign ranks + sort(); + rankAll(); + } + catch(RuntimeException e) + { + throw new ClassificationException(e); + } } *************** *** 161,165 **** private void sort() { ! this.aoSortedStatRefs = (WordStats[])oStats.values().toArray(new WordStats[0]); marf.util.Arrays.sort(aoSortedStatRefs, new FrequencyComparator(SortComparator.DESCENDING)); } --- 332,337 ---- private void sort() { ! // this.aoSortedStatRefs = (WordStats[])oStats.values().toArray(new WordStats[0]); ! this.aoSortedStatRefs = (StatisticalObject[])oStats.values().toArray(new StatisticalObject[0]); marf.util.Arrays.sort(aoSortedStatRefs, new FrequencyComparator(SortComparator.DESCENDING)); } *************** *** 170,176 **** private final void rankAll() { ! for(int i = 0; i < aoSortedStatRefs.length; i++) { ! aoSortedStatRefs[i].setRank(i + 1); } } --- 342,348 ---- private final void rankAll() { ! for(int i = 0; i < this.aoSortedStatRefs.length; i++) { ! this.aoSortedStatRefs[i].setRank(i + 1); } } *************** *** 183,187 **** System.out.println("f = Frequency, r = Rank"); ! 
for(int i = 0; i < aoSortedStatRefs.length; i += 10 * OUTPUT_PAGE_SIZE) { System.out.println --- 355,359 ---- System.out.println("f = Frequency, r = Rank"); ! for(int i = 0; i < this.aoSortedStatRefs.length; i += 10 * this.iOutputPageSize) { System.out.println *************** *** 189,193 **** "\n" + "---------------------------------\n" + ! "Words from " + (i + 1) + " to " + (i + OUTPUT_PAGE_SIZE) + "\n" + "---------------------------------\n\n" ); --- 361,365 ---- "\n" + "---------------------------------\n" + ! "Words from " + (i + 1) + " to " + (i + this.iOutputPageSize) + "\n" + "---------------------------------\n\n" ); *************** *** 195,224 **** System.out.println("Columns: r, f, f*r, word"); for ( int j = 0; ! j < (aoSortedStatRefs.length - i > OUTPUT_PAGE_SIZE ? OUTPUT_PAGE_SIZE : aoSortedStatRefs.length - i); j++ ) { ! System.out.println ! ( ! aoSortedStatRefs[i + j].getRank() + "\t" + ! aoSortedStatRefs[i + j].getFrequency() + "\t" + ! aoSortedStatRefs[i + j].getFrequency() * aoSortedStatRefs[i + j].getRank() + "\t" + ! aoSortedStatRefs[i + j].getLexeme() ! ); } } // Frequency count ! int aiFrequencies[] = new int[OUTPUT_PAGE_SIZE]; int iCurrFrequency = 1; ! for(int i = aoSortedStatRefs.length - 1; i > 0; i--) { //Debug.debug("freq: " + iCurrFrequency + ", i=" + i + ", len = " + aoSortedStatRefs.length); ! if(aoSortedStatRefs[i].getFrequency() == iCurrFrequency) { // Such a frequency happened before --- 367,406 ---- System.out.println("Columns: r, f, f*r, word"); + StringBuffer oStatsDump = new StringBuffer(); + for ( int j = 0; ! j < (this.aoSortedStatRefs.length - i > this.iOutputPageSize ? this.iOutputPageSize : this.aoSortedStatRefs.length - i); j++ ) { ! StatisticalObject oStatsItem = this.aoSortedStatRefs[i + j]; ! ! oStatsDump ! .append(oStatsItem.getRank()).append("\t") ! .append(oStatsItem.getFrequency()).append("\t") ! .append(oStatsItem.getFrequency() * oStatsItem.getRank()).append("\t"); ! ! if(oStatsItem instanceof WordStats) ! { ! 
oStatsDump.append(((WordStats)oStatsItem).getLexeme()); ! } ! ! oStatsDump.append("\n"); } + + System.out.print(oStatsDump); } // Frequency count ! int aiFrequencies[] = new int[this.iOutputPageSize]; int iCurrFrequency = 1; ! for(int i = this.aoSortedStatRefs.length - 1; i > 0; i--) { //Debug.debug("freq: " + iCurrFrequency + ", i=" + i + ", len = " + aoSortedStatRefs.length); ! if(this.aoSortedStatRefs[i].getFrequency() == iCurrFrequency) { // Such a frequency happened before *************** *** 227,234 **** else { ! // First occurence of such a frequency iCurrFrequency = aoSortedStatRefs[i].getFrequency(); ! if(iCurrFrequency < OUTPUT_PAGE_SIZE) { aiFrequencies[iCurrFrequency - 1] = 1; --- 409,416 ---- else { ! // First occurrence of such a frequency iCurrFrequency = aoSortedStatRefs[i].getFrequency(); ! if(iCurrFrequency < this.iOutputPageSize) { aiFrequencies[iCurrFrequency - 1] = 1; *************** *** 239,243 **** ( "WARNING: Occurence of a frequency (" + iCurrFrequency + ") exceeds " ! + "output page size (" + OUTPUT_PAGE_SIZE + "), and, therefore, ignored." ); --- 421,425 ---- ( "WARNING: Occurence of a frequency (" + iCurrFrequency + ") exceeds " ! + "output page size (" + this.iOutputPageSize + "), and, therefore, ignored." ); *************** *** 255,259 **** ); ! for(int i = 0; i < OUTPUT_PAGE_SIZE; i++) { System.out.println((i + 1) + "\t" + aiFrequencies[i]); --- 437,441 ---- ); ! for(int i = 0; i < this.iOutputPageSize; i++) { System.out.println((i + 1) + "\t" + aiFrequencies[i]); *************** *** 263,267 **** /** * Dumps CVS values of the rank and frequency into a file. ! * Filename is composed from the orginal corpus name plus the .csv extension. * By default the dump is in the log() scale. * @throws IOException --- 445,449 ---- /** * Dumps CVS values of the rank and frequency into a file. ! * Filename is composed from the original corpus name plus the .csv extension. * By default the dump is in the log() scale. 
* @throws IOException *************** *** 449,453 **** * @since 0.3.0.5 */ ! public final WordStats[] getSortedStatRefs() { return this.aoSortedStatRefs; --- 631,636 ---- * @since 0.3.0.5 */ ! public final StatisticalObject[] getSortedStatRefs() ! // public final WordStats[] getSortedStatRefs() { return this.aoSortedStatRefs; *************** *** 511,515 **** .append("Maximum word length: ").append(this.iMaxWordLength).append("\n") .append("Dictionary size: ").append(this.oStats.size()).append("\n") ! .append("WordStats Dictionary:\n") .append(this.oStats); --- 694,698 ---- .append("Maximum word length: ").append(this.iMaxWordLength).append("\n") .append("Dictionary size: ").append(this.oStats.size()).append("\n") ! .append("Stats Dictionary:\n") .append(this.oStats); |