|
From: <jgr...@us...> - 2003-09-22 13:07:14
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv18165/Classifier
Modified Files:
Bayes.pm
Log Message:
Merge Alan Beale's patch that improves the display of word probabilities
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.198
retrieving revision 1.199
diff -C2 -d -r1.198 -r1.199
*** Bayes.pm 19 Sep 2003 20:18:52 -0000 1.198
--- Bayes.pm 22 Sep 2003 13:06:39 -0000 1.199
***************
*** 63,66 ****
--- 63,69 ----
$self->{wordscores__} = 0;
+ # Choice for the format of the "word matrix" display.
+ $self->{wmformat__} = 'prob';
+
# Just our hostname
$self->{hostname__} = '';
***************
*** 846,851 ****
--- 849,864 ----
my $word_count = 0;
+ # The correction value is used to generate score displays in the scores__
+ # variable which are consistent with the word scores shown by the GUI's
+ # word lookup feature. It is computed to make the contribution of a word
+ # which is unrepresented in a bucket zero. This correction affects only
+ # the values displayed in scores__; it has no effect on the classification
+ # process.
+
+ my $correction = 0;
+
foreach my $word (keys %{$self->{parser__}->{words__}}) {
$word_count += 2;
+ my $wmax = -10000;
foreach my $bucket (@buckets) {
***************
*** 854,857 ****
--- 867,871 ----
$matchcount{$bucket} += $self->{parser__}{words__}{$word} if ($probability != 0);
$probability = $self->{not_likely__} if ( $probability == 0 );
+ $wmax = $probability if ( $wmax < $probability );
# Here we are doing the bayes calculation: P(word|bucket) is in probability
***************
*** 860,863 ****
--- 874,883 ----
$score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
}
+
+ if ($wmax > $self->{not_likely__}) {
+ $correction += $self->{not_likely__} * $self->{parser__}{words__}{$word};
+ } else {
+ $correction += $wmax * $self->{parser__}{words__}{$word};
+ }
}
***************
*** 866,890 ****
my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
foreach my $bucket (@buckets) {
$chi{$bucket} = chi2( $score{$bucket}, $word_count, -int($score{$ranking[0]}/log(10)) * log(10) );
}
! # If no bucket has a probability better than 0.5, call the message "unclassified".
my $class = 'unclassified';
! if ( $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
$class = $ranking[0];
}
# Now take a look at the top two chi tests, if they are close to each other then
! # we are unsure
my $c0 = 1.0 - $chi{$ranking[0]};
my $c1 = 1.0 - $chi{$ranking[1]};
- my $certainty = ($c1-$c0 + 1) / 2;
$class = 'unsure' if ( $certainty < 0.4 );
if ($self->{wordscores__} && defined($ui) ) {
my %qm = %{$self->{parser__}->quickmagnets()};
--- 886,935 ----
my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
+ my %raw_score;
+ my $base_score = $score{$ranking[0]};
+ my $total = 0;
+
foreach my $bucket (@buckets) {
$chi{$bucket} = chi2( $score{$bucket}, $word_count, -int($score{$ranking[0]}/log(10)) * log(10) );
}
! # If the first and second bucket are too close in their probabilities, call the message
! # unclassified. Also if there are fewer than 2 buckets.
my $class = 'unclassified';
! if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
$class = $ranking[0];
}
# Now take a look at the top two chi tests, if they are close to each other then
! # we are unsure. If there are fewer than two buckets, the message is unclassified,
! # and there is no point to looking at the chi result.
+ my $certainty;
+ if (@buckets > 1) {
my $c0 = 1.0 - $chi{$ranking[0]};
my $c1 = 1.0 - $chi{$ranking[1]};
+ $certainty = ($c1-$c0 + 1) / 2;
+ } else {
+ $certainty = 1.0;
+ }
$class = 'unsure' if ( $certainty < 0.4 );
+
+ # Compute the total of all the scores to generate the normalized scores and probability
+ # estimate. $total is always 1 after the first loop iteration, so any additional term
+ # less than 2 ** -54 is insignificant, and need not be computed.
+
+ my $ln2p_54 = -54 * log(2);
+
+ foreach my $b (@ranking) {
+ $raw_score{$b} = $score{$b};
+ $score{$b} -= $base_score;
+
+ $total += exp($score{$b}) if ($score{$b} > $ln2p_54 );
+ }
+
if ($self->{wordscores__} && defined($ui) ) {
my %qm = %{$self->{parser__}->quickmagnets()};
***************
*** 930,951 ****
}
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<b>Verdict: <font color=\"$self->{colors__}{$class}\">$class ($certainty $chi{$ranking[0]} $chi{$ranking[1]})</font></b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
foreach my $b (@ranking) {
! # Take a score value (which is log of the probability) and write it out as 0.000000 lots 00000001234, to do this we
! # calculate the number of 0 between the . and the first significant digit and output the number of zeroes and
! # then the significant digits
! my $zero_count = -int($score{$b}/log(10));
! my $significant = sprintf( "%.6f", exp($score{$b} + $zero_count * log(10)) );
! $significant =~ s/^0\.//;
! my $probstr = sprintf( "0. [%d zeroes] %s", $zero_count, $significant );
$self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
}
$self->{scores__} .= "</table><hr>";
$self->{scores__} .= "<table class=\"top20Words\">\n";
$self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n";
--- 975,1027 ----
}
! $self->{scores__} .= "<a name=\"scores\">";
!
! # If there are fewer than 2 buckets, there is no "verdict " to mention.
! if (@buckets > 1) {
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<b>Verdict: <font color=\"$self->{colors__}{$class}\">$class ($certainty $chi{$ranking[0]} $chi{$ranking[1]})</font></b><p>\n";
! } else {
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n";
! }
! $self->{scores__} .= "<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
! if ($self->{wmformat__} eq 'score') {
! $self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\" align=\"center\">$language{Score}</th><th scope=\"col\">$language{Probability}</th></tr>\n";
! } else {
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
+ }
+
+ my $log10 = log(10.0);
foreach my $b (@ranking) {
+ my $prob = exp($score{$b})/$total;
+ my $probstr;
+ my $rawstr;
! # If the computed probability would display as 1, display it as .999999 instead.
! # We don't want to give the impression that POPFile is ever completely sure of its
! # classification.
! if ($prob >= .999999) {
! $probstr = sprintf("%12.6f", 0.999999);
! } elsif ($prob >= 0.1 || $prob == 0.0) {
! $probstr = sprintf("%12.6f", $prob);
! } else {
! $probstr = sprintf("%17.6e", $prob);
! }
+ if ($self->{wmformat__} eq 'score') {
+ $rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10);
+ $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td align=right>$rawstr </td>\n<td>$probstr</td>\n</tr>\n";
+ } else {
$self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
}
+ }
$self->{scores__} .= "</table><hr>";
+
+ # We want a link to change the format here. But only the UI knows how to build
+ # that link. So we just insert a comment which can be replaced by the UI. There's
+ # probably a better way.
+
+ $self->{scores__} .= "<!--format--><p>";
$self->{scores__} .= "<table class=\"top20Words\">\n";
$self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n";
***************
*** 959,963 ****
$self->{scores__} .= "</tr>";
! my @ranked_words = sort {$self->get_sort_value_( $ranking[0], $b ) <=> $self->get_sort_value_( $ranking[0], $a )} keys %{$self->{parser__}->{words__}};
foreach my $word (@ranked_words) {
--- 1035,1063 ----
$self->{scores__} .= "</tr>";
! my %wordprobs;
!
! # If the word matrix is supposed to show probabilities, compute them,
! # saving the results in %wordprobs.
!
! if ( $self->{wmformat__} eq 'prob') {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! my $sumfreq = 0;
! my %wval;
! foreach my $bucket (@ranking) {
! $wval{$bucket} = exp(get_sort_value_( $self, $bucket, $word ));
! $sumfreq += $wval{$bucket};
! }
! foreach my $bucket (@ranking) {
! $wordprobs{$bucket,$word} = $wval{$bucket} / $sumfreq;
! }
! }
! }
!
! my @ranked_words;
! if ($self->{wmformat__} eq 'prob') {
! @ranked_words = sort {$wordprobs{$ranking[0],$b} <=> $wordprobs{$ranking[0],$a}} keys %{$self->{parser__}->{words__}};
! } else {
! @ranked_words = sort {$self->get_sort_value_( $ranking[0], $b ) <=> $self->get_sort_value_( $ranking[0], $a )} keys %{$self->{parser__}->{words__}};
! }
foreach my $word (@ranked_words) {
***************
*** 989,993 ****
if ( $probability != 0 ) {
! my $wordprobstr = sprintf("%12.4f", exp($probability) );
$self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n";
--- 1089,1100 ----
if ( $probability != 0 ) {
! my $wordprobstr;
! if ($self->{wmformat__} eq 'score') {
! $wordprobstr = sprintf("%12.4f", ($probability - $self->{not_likely__})/$log10 );
! } elsif ($self->{wmformat__} eq 'prob') {
! $wordprobstr = sprintf("%12.4f", $wordprobs{$bucket,$word});
! } else {
! $wordprobstr = sprintf("%13.5f", exp($probability) );
! }
$self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n";
***************
*** 1762,1765 ****
--- 1869,1874 ----
}
+ $self->load_word_matrix_();
+
return 1;
}
***************
*** 2094,2097 ****
--- 2203,2214 ----
return $self->{magnet_count__};
+ }
+
+ sub wmformat
+ {
+ my ( $self, $value ) = @_;
+
+ $self->{wmformat__} = $value if (defined $value);
+ return $self->{wmformat__};
}
|