|
From: <jgr...@us...> - 2003-09-03 19:37:41
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv16348/Classifier
Modified Files:
Bayes.pm
Log Message:
Change the way we display the probability numbers in the single-message view so that no normalization is applied.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.181
retrieving revision 1.182
diff -C2 -d -r1.181 -r1.182
*** Bayes.pm 20 Aug 2003 06:12:15 -0000 1.181
--- Bayes.pm 3 Sep 2003 19:37:06 -0000 1.182
***************
*** 664,683 ****
# P(word|bucket) ^ word count and multiply to the score
- my $logbuck = 1;
- $logbuck = log( $#buckets + 1 ) if ( $#buckets > 0 );
-
- # Ideally, the "raw score" in the score display would reflect the sum of the
- # scores for the individual words, as shown by the lookup GUI. Actually
- # doing this requires a fair amount of computation to compute the sum of the
- # probabilities. If we assume that only the most probable choice is significant
- # (that is, that the max probability and the sum of the probabilities are the
- # same), we do much less computation, and still end up with results that are
- # "close enough for jazz". Note that this makes *no* difference for
- # classification - it only matters for the debug (bayes.pl) display.
-
- my $correction = -$logbuck;
-
- # Switching from using *= to += and using the log of every probability instead
-
foreach my $word (keys %{$self->{parser__}->{words__}}) {
my $wmax = -10000;
--- 664,667 ----
***************
*** 695,704 ****
$score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
}
-
- if ($wmax > $self->{not_likely__}) {
- $correction += ($wmax - $logbuck) * $self->{parser__}{words__}{$word};
- } else {
- $correction += $wmax * $self->{parser__}{words__}{$word};
- }
}
--- 679,682 ----
***************
*** 707,729 ****
my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
- my %raw_score;
- my $base_score = $score{$ranking[0]};
- my $total = 0;
-
- $self->log_( "Base score is $base_score for $ranking[0]" );
-
- # Compute the total of all the scores to generate the normalized scores and probability
- # estimate. $total is always 1 after the first loop iteration, so any additional term
- # less than 2 ** -54 is insignificant, and need not be computed.
-
- my $ln2 = log(2);
-
- foreach my $b (@ranking) {
- $raw_score{$b} = $score{$b};
- $score{$b} -= $base_score;
-
- $total += exp($score{$b}) if ($score{$b} > ( -54 * $ln2 ) );
- }
-
if ($self->{wordscores__} && defined($ui) ) {
my %qm = %{$self->{parser__}->quickmagnets()};
--- 685,688 ----
***************
*** 774,787 ****
foreach my $b (@ranking) {
- my $prob = exp($score{$b})/$total;
- my $probstr;
! if ($prob >= 0.1 || $prob == 0.0) {
! $probstr = sprintf("%12.6f", $prob);
! } else {
! $probstr = sprintf("%17.6e", $prob);
! }
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
}
--- 733,747 ----
foreach my $b (@ranking) {
! # Take a score value (which is log of the probability) and write it out as 0.000000 lots 00000001234, to do this we
! # calculate the number of 0 between the . and the first significant digit and output the number of zeroes and
! # then the significant digits
! my $zero_count = -int($score{$b}/log(10));
! my $significant = sprintf( "%.6f", exp($score{$b} + $zero_count * log(10)) );
! $significant =~ s/^0\.//;
! my $probstr = sprintf( "0. [%d zeroes] %s", $zero_count, $significant );
!
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
}
***************
*** 846,852 ****
my $class = 'unclassified';
! if ( ( $total != 0 ) && ( $score{$ranking[0]} > $self->{unclassified__} + log($total) ) ) {
$class = $ranking[0];
! }
return $class;
--- 806,812 ----
my $class = 'unclassified';
! # if ( ( $total != 0 ) && ( $score{$ranking[0]} > $self->{unclassified__} + log($total) ) ) {
$class = $ranking[0];
! # }
return $class;
|