|
From: <jgr...@us...> - 2003-09-03 21:53:15
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv12610/Classifier
Modified Files:
Bayes.pm
Log Message:
Use chi2 test taken similar to SpamBayes to implement an unsure feature, when POPFile is close on the top two buckets it will mark an email as unsure instead of picking the top classification
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.183
retrieving revision 1.184
diff -C2 -d -r1.183 -r1.184
*** Bayes.pm 3 Sep 2003 20:47:47 -0000 1.183
--- Bayes.pm 3 Sep 2003 21:53:11 -0000 1.184
***************
*** 589,592 ****
--- 589,621 ----
# ---------------------------------------------------------------------------------------------
#
+ # chi2
+ #
+ # $val The value on which we do the chi2 test
+ # $free Number of degrees of freedom
+ # $modifier log() of a power of 10 to make values come in range
+ #
+ # Performs a chi-squared calculation on the passed in log(probability), liberally inspired
+ # by code in SpamBayes and work by Gary Robinson
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub chi2
+ {
+ my ( $val, $free, $modifier ) = @_;
+
+ my $m = $val + $modifier;
+ my $sum = exp(-$m);
+ my $term = $sum;
+
+ for my $i (1..$free/2) {
+ $term *= $m / $i;
+ $sum += $term;
+ }
+
+ return ($sum < 1)?$sum:1;
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
# classify
#
***************
*** 658,661 ****
--- 687,691 ----
my %score;
my %matchcount;
+ my %chi;
for my $bucket (@buckets) {
***************
*** 688,691 ****
--- 718,739 ----
my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
+ foreach my $bucket (@buckets) {
+ $chi{$bucket} = chi2( $score{$bucket}, 2 * $matchcount{$bucket}, -int($score{$ranking[0]}/log(10)) * log(10) );
+ }
+
+ # If no bucket has a probability better than 0.5, call the message "unclassified".
+ my $class = 'unclassified';
+
+ if ( $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
+ $class = $ranking[0];
+ }
+
+ # Now take a look at the top two chi tests, if they are close to each other then
+ # we are unsure
+
+ if ( ( $chi{$ranking[0]} * 0.5 ) < $chi{$ranking[1]} ) {
+ $class = 'unsure';
+ }
+
if ($self->{wordscores__} && defined($ui) ) {
my %qm = %{$self->{parser__}->quickmagnets()};
***************
*** 732,736 ****
}
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
--- 780,784 ----
}
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<b>Verdict: <font color=\"$self->{colors__}{$class}\">$class</font></b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
***************
*** 804,814 ****
$self->{scores__} .= "</table></p>";
- }
-
- # If no bucket has a probability better than 0.5, call the message "unclassified".
- my $class = 'unclassified';
-
- if ( $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
- $class = $ranking[0];
}
--- 852,855 ----
|