[marf-cvs] marf/src/marf/Classification/Stochastic Stochastic.java, 1.25, 1.25.4.1 ZipfLaw.java, 1.30, 1.30.4.1

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/marf/marf/src/marf/Classification/Stochastic
In directory sc8-pr-cvs16.sourceforge.net:/tmp/cvs-serv9497/src/marf/Classification/Stochastic

Modified Files:
      Tag: DISTRIBUTED_MARF_0_3_0_INTEGRATION
	Stochastic.java ZipfLaw.java 
Log Message:
Consolidate most of the differences between the MAIN
and Distributed MARF branches. This primarily includes
the copyright year update to 2008, SpeakerIdentApp
with the new options of loaders and the BandStopFilter,
ZipfLaw, CosineSimilarityMeasure, and HammingDistance.
Include stats per config along with stats per subject.
Add timing measurements.
A lot of comment spell checks and other corrections.
Generalize some of the code, fix naming conventions in
places. Fix some bugs that popped up in the main branch
down to DMARF. Add some new implementation developments,
such as nlp.Storage and Classification.Similarity.
Refactor Configuration and some other classes. Make
WAVLoader more customizable, in particular being able
to load 16000 Hz samples. Many other fixes that will be
summarized in the MAIN's ChangeLog.


Index: Stochastic.java
===================================================================
RCS file: /cvsroot/marf/marf/src/marf/Classification/Stochastic/Stochastic.java,v
retrieving revision 1.25
retrieving revision 1.25.4.1
diff -C2 -d -r1.25 -r1.25.4.1
*** Stochastic.java	31 Jul 2006 02:19:13 -0000	1.25
--- Stochastic.java	9 Apr 2008 00:32:11 -0000	1.25.4.1
***************
*** 12,16 ****
   * <p>TODO: partially implemented.</p>
   *
!  * <p>$Id$</p>
   *
   * @author Serguei Mokhov
--- 12,16 ----
   * <p>TODO: partially implemented.</p>
   *
!  * $Id$
   *
   * @author Serguei Mokhov

Index: ZipfLaw.java
===================================================================
RCS file: /cvsroot/marf/marf/src/marf/Classification/Stochastic/ZipfLaw.java,v
retrieving revision 1.30
retrieving revision 1.30.4.1
diff -C2 -d -r1.30 -r1.30.4.1
*** ZipfLaw.java	3 Sep 2006 20:56:01 -0000	1.30
--- ZipfLaw.java	9 Apr 2008 00:32:12 -0000	1.30.4.1
***************
*** 8,13 ****
--- 8,16 ----
  
  import marf.Classification.ClassificationException;
+ import marf.Classification.Distance.DiffDistance;
  import marf.FeatureExtraction.IFeatureExtraction;
+ import marf.Stats.StatisticalObject;
  import marf.Stats.WordStats;
+ import marf.Storage.Result;
  import marf.Storage.StorageException;
  import marf.util.SortComparator;
***************
*** 31,35 ****
  	 * @since 0.3.0.5
  	 */
! 	public static final int OUTPUT_PAGE_SIZE = 100;
  
  	/**
--- 34,38 ----
  	 * @since 0.3.0.5
  	 */
! 	public static final int DEFAULT_OUTPUT_PAGE_SIZE = 100;
  
  	/**
***************
*** 40,45 ****
  	/**
  	 * Sorted references to stats.
  	 */
! 	private WordStats[] aoSortedStatRefs = null;
  
  	/**
--- 43,51 ----
  	/**
  	 * Sorted references to stats.
+ 	 * As of 0.3.0.6 was set to the base type StatisticalObject
+ 	 * instead of WordStats to allow other than word elements. 
  	 */
! //	private WordStats[] aoSortedStatRefs = null;
! 	private StatisticalObject[] aoSortedStatRefs = null;
  
  	/**
***************
*** 61,64 ****
--- 67,77 ----
  
  	/**
+ 	 * When the results are dumped in the text mode, tell how
+ 	 * many records to show per page.
+ 	 * @since 0.3.0.6
+ 	 */
+ 	private int iOutputPageSize = DEFAULT_OUTPUT_PAGE_SIZE;
+ 	
+ 	/**
  	 * For serialization versioning.
  	 * When adding new members or make other structural
***************
*** 84,92 ****
  	/**
  	 * Classification API.
! 	 * @param poFeatureExtraction preprcessing module to get the data from
  	 */
  	public ZipfLaw(IFeatureExtraction poFeatureExtraction)
  	{
  		super(poFeatureExtraction);
  	}
  
--- 97,263 ----
  	/**
  	 * Classification API.
! 	 * @param poFeatureExtraction preprocessing module to get the data from
  	 */
  	public ZipfLaw(IFeatureExtraction poFeatureExtraction)
  	{
  		super(poFeatureExtraction);
+ 		this.strFilename = getTrainingSetFilename().replaceAll("marf.Storage.TrainingSet", getClass().getName());
+ 		this.oStats = new Hashtable();
+ 		this.oObjectToSerialize = this;
+ 	}
+ 	
+ 	/**
+ 	 * @since 0.3.0.6
+ 	 * @see marf.Classification.IClassification#classify(double[])
+ 	 */
+ 	public boolean classify(double[] padFeatureVector)
+ 	throws ClassificationException
+ 	{
+ 		try
+ 		{
+ 			// Unseen data
+ 			collectStatistics(padFeatureVector);
+ 			
+ 			// Back up its stats
+ 			Hashtable oBackupStats = (Hashtable)this.oStats.clone();
+ 			StatisticalObject[] aoSortedDataBackup = (StatisticalObject[])this.aoSortedStatRefs.clone();
+ 			
+ 			// Restore data from the training set
+ 			restore();
+ 			
+ 			double[] adUnseenData = new double[aoSortedDataBackup.length];
+ 			double[] adSeenData = new double[this.aoSortedStatRefs.length];
+ 			
+ 			// The unseen and trained vectors have to have identical
+ 			// observations on the LHS. The ones that are missing on
+ 			// either one get a frequency of zero. This is required
+ 			// for meaningful component-wise distance comparison
+ 			// afterwards as raw percentages will not do that.
+ 			// This can be approximated with the DiffDistance classifier
+ 			// as a temporary workaround
+ 			// XXX
+ 			
+ 			// Compute totals prior conversion to percentages
+ 			int iUnseenObservationsTotal = 0;
+ 			int iTrainingObservationsTotal = 0;
+ 			
+ 			for(int i = 0; i < aoSortedDataBackup.length; i++)
+ 			{
+ 				iUnseenObservationsTotal += aoSortedDataBackup[i].getFrequency();
+ 			}
+ 
+ 			for(int i = 0; i < this.aoSortedStatRefs.length; i++)
+ 			{
+ 				iTrainingObservationsTotal += this.aoSortedStatRefs[i].getFrequency();
+ 			}
+ 
+ 			// Convert unseen and stored data to just plain double[] arrays
+ 			for(int i = 0; i < aoSortedDataBackup.length; i++)
+ 			{
+ 				adUnseenData[i] = (double)aoSortedDataBackup[i].getFrequency() / iUnseenObservationsTotal;
+ 			}
+ 
+ 			for(int i = 0; i < this.aoSortedStatRefs.length; i++)
+ 			{
+ 				adSeenData[i] = (double)this.aoSortedStatRefs[i].getFrequency() / iTrainingObservationsTotal;
+ 			}
+ 			
+ 			// Compare the unseen and stored data using a specified
+ 			// Distance classifier
+ 			// XXX
+ 			//DiffDistance oDistance = ClassifcationFactory.create(this.iDistanceMethod);
+ 			DiffDistance oDistance = new DiffDistance(null);
+ 			double dDistance = oDistance.distance(adSeenData, adUnseenData);
+ 			this.oResultSet.addResult(1, dDistance);
+ 			
+ 			return true;
+ 		}
+ 		catch(ClassificationException e)
+ 		{
+ 			throw e;
+ 		}
+ 		catch(Exception e)
+ 		{
+ 			throw new ClassificationException(e);
+ 		}
+ 	}
+ 
+ 	/**
+ 	 * @since 0.3.0.6
+ 	 * @see marf.Classification.IClassification#train(double[])
+ 	 */
+ 	public boolean train(double[] padFeatureVector)
+ 	throws ClassificationException
+ 	{
+ 		try
+ 		{
+ 			restore();
+ 			collectStatistics(padFeatureVector);
+ 			dump();
+ 			
+ 			return true;
+ 		}
+ 		catch(ClassificationException e)
+ 		{
+ 			throw e;
+ 		}
+ 		catch(Exception e)
+ 		{
+ 			throw new ClassificationException(e);
+ 		}
+ 	}
+ 
+ 	/**
+ 	 * @since 0.3.0.6
+ 	 * @see marf.Classification.IClassification#getResult()
+ 	 */
+ 	public Result getResult()
+ 	{
+ 		// TODO Auto-generated method stub
+ 		return super.getResult();
+ 	}
+ 
+ 	/**
+ 	 * Collects result statistics.
+ 	 * TODO: employ StatsCollector.
+ 	 * @param padFeatures desired stream tokenizer
+ 	 * @throws ClassificationException in case of inner exceptions
+ 	 */
+ 	public final void collectStatistics(double[] padFeatures)
+ 	throws ClassificationException
+ 	{
+ 		try
+ 		{
+ 			// All items are one unit in length
+ 			this.iMinWordLength = this.iMaxWordLength = 1;
+ 
+ 			// Collect Stats
+ 			for(int i = 0; i < padFeatures.length; i++)
+ 			{
+ 				// Look up if we already have an entry for this.
+ 				StatisticalObject oFeatureStats = (StatisticalObject)this.oStats.get(new Double(padFeatures[i]));
+ 
+ 				// New entry
+ 				if(oFeatureStats == null)
+ 				{
+ 					oFeatureStats = new StatisticalObject(1);
+ 					this.oStats.put(new Double(padFeatures[i]), oFeatureStats);
+ 				}
+ 
+ 				// Update existing entry
+ 				else
+ 				{
+ 					oFeatureStats.incFrequency();
+ 				}
+ 			}
+ 
+ 			// Sort and assign ranks
+ 			sort();
+ 			rankAll();
+ 		}
+ 		catch(RuntimeException e)
+ 		{
+ 			throw new ClassificationException(e);
+ 		}
  	}
  
***************
*** 161,165 ****
  	private void sort()
  	{
! 		this.aoSortedStatRefs = (WordStats[])oStats.values().toArray(new WordStats[0]);
  		marf.util.Arrays.sort(aoSortedStatRefs, new FrequencyComparator(SortComparator.DESCENDING));
  	}
--- 332,337 ----
  	private void sort()
  	{
! //		this.aoSortedStatRefs = (WordStats[])oStats.values().toArray(new WordStats[0]);
! 		this.aoSortedStatRefs = (StatisticalObject[])oStats.values().toArray(new StatisticalObject[0]);
  		marf.util.Arrays.sort(aoSortedStatRefs, new FrequencyComparator(SortComparator.DESCENDING));
  	}
***************
*** 170,176 ****
  	private final void rankAll()
  	{
! 		for(int i = 0; i < aoSortedStatRefs.length; i++)
  		{
! 			aoSortedStatRefs[i].setRank(i + 1);
  		}
  	}
--- 342,348 ----
  	private final void rankAll()
  	{
! 		for(int i = 0; i < this.aoSortedStatRefs.length; i++)
  		{
! 			this.aoSortedStatRefs[i].setRank(i + 1);
  		}
  	}
***************
*** 183,187 ****
  		System.out.println("f = Frequency, r = Rank");
  
! 		for(int i = 0; i < aoSortedStatRefs.length; i += 10 * OUTPUT_PAGE_SIZE)
  		{
  			System.out.println
--- 355,359 ----
  		System.out.println("f = Frequency, r = Rank");
  
! 		for(int i = 0; i < this.aoSortedStatRefs.length; i += 10 * this.iOutputPageSize)
  		{
  			System.out.println
***************
*** 189,193 ****
  				"\n" +
  				"---------------------------------\n" +
! 				"Words from " + (i + 1) + " to " + (i + OUTPUT_PAGE_SIZE) + "\n" +
  				"---------------------------------\n\n"
  			);
--- 361,365 ----
  				"\n" +
  				"---------------------------------\n" +
! 				"Words from " + (i + 1) + " to " + (i + this.iOutputPageSize) + "\n" +
  				"---------------------------------\n\n"
  			);
***************
*** 195,224 ****
  			System.out.println("Columns: r, f, f*r, word");
  
  			for
  			(
  				int j = 0;
! 				j < (aoSortedStatRefs.length - i > OUTPUT_PAGE_SIZE ? OUTPUT_PAGE_SIZE : aoSortedStatRefs.length - i);
  				j++
  			)
  			{
! 				System.out.println
! 				(
! 					aoSortedStatRefs[i + j].getRank() + "\t" +
! 					aoSortedStatRefs[i + j].getFrequency() + "\t" +
! 					aoSortedStatRefs[i + j].getFrequency() * aoSortedStatRefs[i + j].getRank() + "\t" +
! 					aoSortedStatRefs[i + j].getLexeme()
! 				);
  			}
  		}
  
  		// Frequency count
! 		int aiFrequencies[] = new int[OUTPUT_PAGE_SIZE];
  		int iCurrFrequency = 1;
  
! 		for(int i = aoSortedStatRefs.length - 1; i > 0; i--)
  		{
  			//Debug.debug("freq: " + iCurrFrequency + ", i=" + i + ", len = " + aoSortedStatRefs.length);
  
! 			if(aoSortedStatRefs[i].getFrequency() == iCurrFrequency)
  			{
  				// Such a frequency happened before
--- 367,406 ----
  			System.out.println("Columns: r, f, f*r, word");
  
+ 			StringBuffer oStatsDump = new StringBuffer();
+ 			
  			for
  			(
  				int j = 0;
! 				j < (this.aoSortedStatRefs.length - i > this.iOutputPageSize ? this.iOutputPageSize : this.aoSortedStatRefs.length - i);
  				j++
  			)
  			{
! 				StatisticalObject oStatsItem = this.aoSortedStatRefs[i + j];
! 
! 				oStatsDump
! 					.append(oStatsItem.getRank()).append("\t")
! 					.append(oStatsItem.getFrequency()).append("\t")
! 					.append(oStatsItem.getFrequency() * oStatsItem.getRank()).append("\t");
! 
! 				if(oStatsItem instanceof WordStats)
! 				{
! 					oStatsDump.append(((WordStats)oStatsItem).getLexeme());
! 				}
! 				
! 				oStatsDump.append("\n");
  			}
+ 			
+ 			System.out.print(oStatsDump);
  		}
  
  		// Frequency count
! 		int aiFrequencies[] = new int[this.iOutputPageSize];
  		int iCurrFrequency = 1;
  
! 		for(int i = this.aoSortedStatRefs.length - 1; i > 0; i--)
  		{
  			//Debug.debug("freq: " + iCurrFrequency + ", i=" + i + ", len = " + aoSortedStatRefs.length);
  
! 			if(this.aoSortedStatRefs[i].getFrequency() == iCurrFrequency)
  			{
  				// Such a frequency happened before
***************
*** 227,234 ****
  			else
  			{
! 				// First occurence of such a frequency
  				iCurrFrequency = aoSortedStatRefs[i].getFrequency();
  				
! 				if(iCurrFrequency < OUTPUT_PAGE_SIZE)
  				{
  					aiFrequencies[iCurrFrequency - 1] = 1;
--- 409,416 ----
  			else
  			{
! 				// First occurrence of such a frequency
  				iCurrFrequency = aoSortedStatRefs[i].getFrequency();
  				
! 				if(iCurrFrequency < this.iOutputPageSize)
  				{
  					aiFrequencies[iCurrFrequency - 1] = 1;
***************
*** 239,243 ****
  					(
  						"WARNING: Occurence of a frequency (" + iCurrFrequency + ") exceeds "
! 						+ "output page size (" + OUTPUT_PAGE_SIZE + "), and, therefore, ignored."
  					);
  
--- 421,425 ----
  					(
  						"WARNING: Occurence of a frequency (" + iCurrFrequency + ") exceeds "
! 						+ "output page size (" + this.iOutputPageSize + "), and, therefore, ignored."
  					);
  
***************
*** 255,259 ****
  		);
  
! 		for(int i = 0; i < OUTPUT_PAGE_SIZE; i++)
  		{
  			System.out.println((i + 1) + "\t" + aiFrequencies[i]);
--- 437,441 ----
  		);
  
! 		for(int i = 0; i < this.iOutputPageSize; i++)
  		{
  			System.out.println((i + 1) + "\t" + aiFrequencies[i]);
***************
*** 263,267 ****
  	/**
  	 * Dumps CVS values of the rank and frequency into a file.
! 	 * Filename is composed from the orginal corpus name plus the .csv extension.
  	 * By default the dump is in the log() scale.
  	 * @throws IOException
--- 445,449 ----
  	/**
  	 * Dumps CVS values of the rank and frequency into a file.
! 	 * Filename is composed from the original corpus name plus the .csv extension.
  	 * By default the dump is in the log() scale.
  	 * @throws IOException
***************
*** 449,453 ****
  	 * @since 0.3.0.5
  	 */
! 	public final WordStats[] getSortedStatRefs()
  	{
  		return this.aoSortedStatRefs;
--- 631,636 ----
  	 * @since 0.3.0.5
  	 */
! 	public final StatisticalObject[] getSortedStatRefs()
! //	public final WordStats[] getSortedStatRefs()
  	{
  		return this.aoSortedStatRefs;
***************
*** 511,515 ****
  			.append("Maximum word length: ").append(this.iMaxWordLength).append("\n")
  			.append("Dictionary size: ").append(this.oStats.size()).append("\n")
! 			.append("WordStats Dictionary:\n")
  			.append(this.oStats);
  
--- 694,698 ----
  			.append("Maximum word length: ").append(this.iMaxWordLength).append("\n")
  			.append("Dictionary size: ").append(this.oStats.size()).append("\n")
! 			.append("Stats Dictionary:\n")
  			.append(this.oStats);

[marf-cvs] marf/src/marf/Classification/Stochastic Stochastic.java, 1.25, 1.25.4.1 ZipfLaw.java, 1.30, 1.30.4.1

[marf-cvs] marf/src/marf/Classification/Stochastic Stochastic.java, 1.25, 1.25.4.1 ZipfLaw.java, 1.30, 1.30.4.1