[DL-Learner SVN] SF.net SVN: dl-learner:[2208] trunk/src/dl-learner/org/dllearner

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2208
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2208&view=rev
Author:   jenslehmann
Date:     2010-07-21 08:24:55 +0000 (Wed, 21 Jul 2010)

Log Message:
-----------
more unit tests for heuristic approximation

Modified Paths:
--------------
    trunk/src/dl-learner/org/dllearner/learningproblems/ClassLearningProblem.java
    trunk/src/dl-learner/org/dllearner/learningproblems/Heuristics.java
    trunk/src/dl-learner/org/dllearner/test/junit/HeuristicTests.java

Modified: trunk/src/dl-learner/org/dllearner/learningproblems/ClassLearningProblem.java
===================================================================

--- trunk/src/dl-learner/org/dllearner/learningproblems/ClassLearningProblem.java	2010-07-21 05:48:18 UTC (rev 2207)
+++ trunk/src/dl-learner/org/dllearner/learningproblems/ClassLearningProblem.java	2010-07-21 08:24:55 UTC (rev 2208)
@@ -311,7 +311,7 @@
 				testsPerformed++;
 				
 				// check whether approximation is sufficiently accurate
-				double[] approx = Heuristics.getFMeasureApproximation(instancesCovered, recall, coverageFactor, superClassInstances.size(), testsPerformed, instancesDescription);
+				double[] approx = Heuristics.getFScoreApproximation(instancesCovered, recall, coverageFactor, superClassInstances.size(), testsPerformed, instancesDescription);
 				if(approx[1]<approxDelta) {
 					return approx[0];
 				}

Modified: trunk/src/dl-learner/org/dllearner/learningproblems/Heuristics.java
===================================================================
--- trunk/src/dl-learner/org/dllearner/learningproblems/Heuristics.java	2010-07-21 05:48:18 UTC (rev 2207)
+++ trunk/src/dl-learner/org/dllearner/learningproblems/Heuristics.java	2010-07-21 08:24:55 UTC (rev 2208)
@@ -122,7 +122,7 @@
 	 */
 	public static double[] getConfidenceInterval95Wald(int total, int success) {
 		if(success > total || total < 1) {
-			throw new IllegalArgumentException();
+			throw new IllegalArgumentException("95% confidence interval for " + success + " out of " + total + " trials cannot be estimated.");
 		}
 		double[] ret = new double[2];
 		double p1 = (success+2)/(double)(total+4);
@@ -182,7 +182,7 @@
 	 * @return A two element array, where the first element is the computed F-beta score and the
 	 * second element is the length of the 95% confidence interval around it.
 	 */
-	public static double[] getFMeasureApproximation(int nrOfPosClassifiedPositives, double recall, double beta, int nrOfRelevantInstances, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
+	public static double[] getFScoreApproximation(int nrOfPosClassifiedPositives, double recall, double beta, int nrOfRelevantInstances, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
 		// compute 95% confidence interval
 		double[] interval = Heuristics.getConfidenceInterval95Wald(nrOfInstanceChecks, nrOfSuccessfulInstanceChecks);
 		// multiply by number of instances from which the random samples are drawn
@@ -203,10 +203,21 @@
 		return ret;
 	}
 	
-	public static double[] getAMeasureApproximationStep1(double beta, int nrOfPosExamples, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
+	/**
+	 * In the first step of the AScore approximation, we estimate recall (taking the factor
+	 * beta into account). This is not much more than a wrapper around the modified Wald method.
+	 * @param beta Weights precision and recall. If beta is >1, then recall is more important
+	 * than precision.
+	 * @param nrOfPosExamples Number of positive examples (or instances of the considered class).
+	 * @param nrOfInstanceChecks Number of positive examples (or instances of the considered class) which have been checked.
+	 * @param nrOfSuccessfulInstanceChecks Number of positive examples (or instances of the considered class), where the instance check returned true.
+	 * @return A two element array, where the first element is the recall multiplied by beta and the
+	 * second element is the length of the 95% confidence interval around it.
+	 */
+	public static double[] getAScoreApproximationStep1(double beta, int nrOfPosExamples, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
 		// the method is just a wrapper around a single confidence interval approximation;
 		// method approximates t * a / |R(A)|
-		double[] interval = Heuristics.getConfidenceInterval95Wald(nrOfInstanceChecks, nrOfSuccessfulInstanceChecks);
+		double[] interval = Heuristics.getConfidenceInterval95Wald(nrOfSuccessfulInstanceChecks, nrOfInstanceChecks);
 		double diff = beta * (interval[1] - interval[0]);
 		double ret[] = new double[2];
 		ret[0] = beta * interval[0] + 0.5*diff;
@@ -214,13 +225,36 @@
 		return ret;
 	}
 	
-	public static double[] getAMeasureApproximationStep2(int nrOfPosClassifiedPositives, double[] recallInterval, double beta, int nrOfRelevantInstances, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
-		// TODO: code untested
+	/**
+	 * In step 2 of the A-Score approximation, the precision and overall A-Score is estimated based on
+	 * the estimated recall.
+	 * @param nrOfPosClassifiedPositives Positive examples (instance of a class), which are classified as positives.
+	 * @param recallInterval The estimated recall, which needs to be given as a two element array with the first element being the mean value and the second element being the length of the interval (to be compatible with the step1 method). 
+	 * @param beta Weights precision and recall. If beta is >1, then recall is more important
+	 * than precision.
+	 * @param nrOfRelevantInstances Number of relevant instances, i.e. number of instances, which
+	 * would have been tested without approximations.
+	 * @param nrOfInstanceChecks Performed instance checks for the approximation.
+	 * @param nrOfSuccessfulInstanceChecks Number of performed instance checks, which returned true.
+	 * @return A two element array, where the first element is the estimated A-Score and the
+	 * second element is the length of the 95% confidence interval around it.
+	 */
+	public static double[] getAScoreApproximationStep2(int nrOfPosClassifiedPositives, double[] recallInterval, double beta, int nrOfRelevantInstances, int nrOfInstanceChecks, int nrOfSuccessfulInstanceChecks) {
+		// recall interval is given as mean + interval size (to fit the other method calls) => computer lower and upper border
+		double recallLowerBorder = (recallInterval[0] - 0.5*recallInterval[1]) / beta;
+		double recallUpperBorder = (recallInterval[0] + 0.5*recallInterval[1]) / beta;
+		// estimate precision
 		double[] interval = Heuristics.getConfidenceInterval95Wald(nrOfInstanceChecks, nrOfSuccessfulInstanceChecks);
-		double precisionLowerBorder = nrOfPosClassifiedPositives / interval[1] * nrOfRelevantInstances;
-		double precisionUpperBorder = nrOfPosClassifiedPositives / interval[0] * nrOfRelevantInstances;
-		double lowerBorder = Heuristics.getAScore(recallInterval[0] / beta, precisionLowerBorder, beta);
-		double upperBorder = Heuristics.getAScore(recallInterval[0] / beta, precisionUpperBorder, beta);
+		
+		double precisionLowerBorder = nrOfPosClassifiedPositives / (nrOfPosClassifiedPositives + interval[1] * nrOfRelevantInstances);
+		double precisionUpperBorder = nrOfPosClassifiedPositives / (nrOfPosClassifiedPositives + interval[0] * nrOfRelevantInstances);
+		
+//		System.out.println("rec low: " + recallLowerBorder);
+//		System.out.println("rec up: " + recallUpperBorder);
+//		System.out.println("prec low: " + precisionLowerBorder);
+//		System.out.println("prec up: " + precisionUpperBorder);
+		double lowerBorder = Heuristics.getAScore(recallLowerBorder, precisionLowerBorder, beta);
+		double upperBorder = Heuristics.getAScore(recallUpperBorder, precisionUpperBorder, beta);
 		double diff = upperBorder - lowerBorder;
 		double ret[] = new double[2];
 		ret[0] = lowerBorder + 0.5*diff;

Modified: trunk/src/dl-learner/org/dllearner/test/junit/HeuristicTests.java
===================================================================
--- trunk/src/dl-learner/org/dllearner/test/junit/HeuristicTests.java	2010-07-21 05:48:18 UTC (rev 2207)
+++ trunk/src/dl-learner/org/dllearner/test/junit/HeuristicTests.java	2010-07-21 08:24:55 UTC (rev 2208)
@@ -161,18 +161,34 @@
 	@Test
 	public void approximationTests() {
 		// perform F-Measure example in ontology engineering paper, which was computed on paper
-		// TODO: compute again, because unit tests fails (probably rounding errors)
-		double[] approx1 = Heuristics.getFMeasureApproximation(800, 0.8, 1, 10000, 41, 31);
-		assertEquals(0.0505, approx1[1], delta);
-		double[] approx2 = Heuristics.getFMeasureApproximation(800, 0.8, 1, 10000, 42, 32);
-		assertEquals(0.1699, approx2[0], delta);
-		assertEquals(0.0489, approx2[1], delta);
+		double[] approx1 = Heuristics.getFScoreApproximation(800, 0.8, 1, 10000, 41, 31);
+		// smaller delta, because of rounding errors
+		assertEquals(0.050517, approx1[1], 0.001);
+		double[] approx2 = Heuristics.getFScoreApproximation(800, 0.8, 1, 10000, 42, 32);
+		// 0.1699 in the paper is just current precision divided by  multiplied by relevant instances
+		// 0.1778 is the center of the interval
+		assertEquals(0.178091, approx2[0], 0.001);
+		assertEquals(0.048933, approx2[1], 0.001);
 		
 		// perform A-Measure example in ontology engineering paper
 		// setup: 1000 class instances, 10000 relevant instances, delta=0.10
 		// input1: 90 out of 95 tests => no success para 1, 91 out of 96 => success
 		// input2: using estimation from input 1, 32 out of 64 => success
 		// overall accuracy: 64%
+		double[] approx1Step1 = Heuristics.getAScoreApproximationStep1(1, 1000, 90, 95);
+		assertEquals(0.10006, approx1Step1[1], 0.001);
+		// on paper, it works with 91 out of 96; but in the implementation only with
+		// 92 out of 97 (probably rounding errors)
+		double[] approx2Step1 = Heuristics.getAScoreApproximationStep1(1, 1000, 92, 97);
+		assertTrue(approx2Step1[1] < 0.1);
+		
+		// double[] approxStep2 = Heuristics.getAScoreApproximationStep2(800, new double[] {approx2Step1[0]-0.5*approx2Step1[1], approx2Step1[0]+0.5*approx2Step1[1]}, 1, 10000, 64, 32);
+		// example computed by hand (note that it differs from the paper example in that
+		// we do not use the square root)
+		double[] approxStep2 = Heuristics.getAScoreApproximationStep2(800, approx2Step1, 1, 10000, 64, 32);
+		assertEquals(0.49822461, approxStep2[0]-0.5*approxStep2[1], 0.001);
+		assertEquals(0.5771179, approxStep2[0]+0.5*approxStep2[1], 0.001);
+//		System.out.println(approxStep2[0] + " " + approxStep2[1]);
 	}
 	
 	// the class learning problem provides several ways to get the accuracy of a description, this method


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.