From: <jgr...@us...> - 2003-03-21 23:10:36
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv13478/Classifier Modified Files: Bayes.pm Log Message: Major alteration to the top 20 words so that it shows the full matrix of words that were used to make a decision about an email; removed the old top 20 code and all the complex calculations it did and replaced with simple dump of the matrix for that message Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.116 retrieving revision 1.117 diff -C2 -d -r1.116 -r1.117 *** Bayes.pm 6 Mar 2003 22:08:13 -0000 1.116 --- Bayes.pm 21 Mar 2003 23:10:32 -0000 1.117 *************** *** 533,539 **** my %score; - my %wordprob; - my %wtprob; - my %wbprob; for my $bucket (@buckets) { --- 533,536 ---- *************** *** 562,569 **** foreach my $word (keys %{$self->{parser__}->{words__}}) { my $wmax = -10000; - if ($self->{wordscores__}) { - $wtprob{$word} = 0; - $wbprob{$word} = {}; - } foreach my $bucket (@buckets) { --- 559,562 ---- *************** *** 577,584 **** $score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} ); - if ($self->{wordscores__}) { - $wtprob{$word} += exp($probability); - $wbprob{$word}{$bucket} = exp($probability); - } } --- 570,573 ---- *************** *** 588,592 **** $correction += $wmax * $self->{parser__}{words__}{$word}; } - $wordprob{$word} = exp($wmax); } --- 577,580 ---- *************** *** 594,601 **** my @ranking = sort {$score{$b} <=> $score{$a}} keys %score; ! my @wordrank; ! if ($self->{wordscores__}) { ! @wordrank = sort {($wordprob{$b} / $wtprob{$b}) <=> ($wordprob{$a} / $wtprob{$a})} keys %wordprob; ! } my %raw_score; --- 582,586 ---- my @ranking = sort {$score{$b} <=> $score{$a}} keys %score; ! my %raw_score; *************** *** 619,622 **** --- 604,608 ---- my $prob = exp($score{$b})/$total; my $probstr; + if ($prob >= 0.1 || $prob == 0.0) { $probstr = sprintf("%12.6f", $prob); *************** *** 624,652 **** $probstr = sprintf("%17.6e", $prob); } $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n"; } $self->{scores__} .= "</table>"; if ($self->{wordscores__}) { $self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n"; ! $self->{scores__} .= "<tr>\n<th scope=\"col\">Word</th><th scope=\"col\">Prob</th>\n<th> </th>\n"; ! $self->{scores__} .= "<th scope=\"col\">\n<font color=\"$self->{colors__}{$ranking[0]}\">$ranking[0]</font>\n</th>\n</tr>\n"; ! my $wi = 0; ! foreach my $word (@wordrank) { ! if ( $wi < 20 && $wordprob{$word} / $wtprob{$word} >= 0.25 ) { ! my $wordstr = $word; ! my $long = $wordstr; ! if ( length($wordstr)>14 ) { ! $wordstr =~ /(.{12})/; ! $wordstr = "$1..."; } - my $wordcolor = get_color($self, $word); - my $wordprobstr = sprintf("%12.4f", $wordprob{$word} / $wtprob{$word}); - my $otherprobstr = sprintf("%12.4f", $wbprob{$word}{$ranking[0]} / $wtprob{$word}); - $self->{scores__} .= "<tr>\n<td><font color=\"$wordcolor\"><a title=\"$long\">$wordstr</a></font></td>\n"; - $self->{scores__} .= "<td><font color=\"$wordcolor\">$wordprobstr</font></td>\n<td> </td>\n"; - $self->{scores__} .= "<td><font color=\"$self->{colors__}{$ranking[0]}\">$otherprobstr</font></td>\n</tr>\n"; } ! $wi += 1; } --- 610,669 ---- $probstr = sprintf("%17.6e", $prob); } + $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n"; } + $self->{scores__} .= "</table>"; if ($self->{wordscores__}) { $self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n"; ! $self->{scores__} .= "<tr>\n<th scope=\"col\">Word</th><th> </th><th scope=\"col\">Count</th><th> </th>\n"; ! ! foreach my $bucket (@buckets) { ! my $bucketcolor = $self->get_bucket_color( $bucket ); ! $self->{scores__} .= "<th><font color=\"$bucketcolor\">$bucket</font></th><th> </th>"; ! } ! ! $self->{scores__} .= "</tr>"; ! ! my @ranked_words = sort {$self->get_value_( $ranking[0], $b ) <=> $self->get_value_( $ranking[0], $a )} keys %{$self->{parser__}->{words__}}; ! ! foreach my $word (@ranked_words) { ! my $known = 0; ! ! foreach my $bucket (@buckets) { ! if ( $self->get_value_( $bucket, $word ) != 0 ) { ! $known = 1; ! last; } } ! ! if ( $known == 1 ) { ! my $wordcolor = $self->get_color( $word ); ! my $count = $self->{parser__}->{words__}{$word}; ! ! $self->{scores__} .= "<tr>\n<td><font color=\"$wordcolor\">$word</font></td><td> </td><td>$count</td><td> </td>\n"; ! ! my $base_probability = $self->get_value_( $ranking[0], $word ); ! ! foreach my $bucket (@buckets) { ! my $probability = get_value_( $self, $bucket, $word ); ! my $color = 'black'; ! ! if ( $probability >= $base_probability ) { ! $color = $self->get_bucket_color( $bucket ); ! } ! ! if ( $probability != 0 ) { ! my $wordprobstr = sprintf("%12.4f", exp($probability) ); ! ! $self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n"; ! } else { ! $self->{scores__} .= "<td> </td>\n<td> </td>\n"; ! } ! } ! } ! ! $self->{scores__} .= "</tr>"; } *************** *** 661,668 **** } - if ( $self->{wordscores__} ) { - $self->{scores__} .= "<p>(<b>$class</b>)</p>"; - } - return $class; } --- 678,681 ---- *************** *** 1042,1046 **** $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_stream($file); ! $self->{parser__}->{color__} = 0; return $result; --- 1055,1059 ---- $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_stream($file); ! $self->{parser__}->{color__} = 0; return $result; *************** *** 1065,1070 **** if ( open NEW, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) { ! print NEW "\n"; ! close NEW; } --- 1078,1083 ---- if ( open NEW, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) { ! print NEW "\n"; ! close NEW; } |