|
From: <jgr...@us...> - 2003-09-29 21:55:22
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv26520/Classifier
Modified Files:
Bayes.pm MailParse.pm
Log Message:
Remove chi2 and unsure code
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.200
retrieving revision 1.201
diff -C2 -d -r1.200 -r1.201
*** Bayes.pm 22 Sep 2003 13:27:21 -0000 1.200
--- Bayes.pm 29 Sep 2003 21:54:44 -0000 1.201
***************
*** 740,771 ****
# ---------------------------------------------------------------------------------------------
#
- # chi2
- #
- # $val The value on which we do the chi2 test
- # $free Number of degrees of freedom
- # $modifier log() of a power of 10 to make values come in range
- #
- # Performs a chi-squared calculation on the passed in log(probability), liberally inspired
- # by code in SpamBayes and work by Gary Robinson
- #
- # ---------------------------------------------------------------------------------------------
- sub chi2
- {
- my ( $val, $free, $modifier ) = @_;
-
- my $m = $val + $modifier;
- my $sum = exp(-$m);
- my $term = $sum;
-
- for my $i (1..($free/2-1)) {
- $term *= $m / $i;
- $sum += $term;
- }
-
- return ($sum<1)?$sum:1;
- }
-
- # ---------------------------------------------------------------------------------------------
- #
# classify
#
--- 740,743 ----
***************
*** 915,922 ****
my $total = 0;
- foreach my $bucket (@buckets) {
- $chi{$bucket} = chi2( $score{$bucket}, $word_count, -int($score{$ranking[0]}/log(10)) * log(10) );
- }
-
# If the first and second bucket are too close in their probabilities, call the message
# unclassified. Also if there are fewer than 2 buckets.
--- 887,890 ----
***************
*** 927,945 ****
}
- # Now take a look at the top two chi tests, if they are close to each other then
- # we are unsure. If there are fewer than two buckets, the message is unclassified,
- # and there is no point to looking at the chi result.
-
- my $certainty;
- if (@buckets > 1) {
- my $c0 = 1.0 - $chi{$ranking[0]};
- my $c1 = 1.0 - $chi{$ranking[1]};
- $certainty = ($c1-$c0 + 1) / 2;
- } else {
- $certainty = 1.0;
- }
-
- $class = 'unsure' if ( $certainty < 0.4 );
-
# Compute the total of all the scores to generate the normalized scores and probability
# estimate. $total is always 1 after the first loop iteration, so any additional term
--- 895,898 ----
***************
*** 999,1009 ****
$self->{scores__} .= "<a name=\"scores\">";
- # If there are fewer than 2 buckets, there is no "verdict " to mention.
- if (@buckets > 1) {
- $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<b>Verdict: <font color=\"$self->{colors__}{$class}\">$class ($certainty $chi{$ranking[0]} $chi{$ranking[1]})</font></b><p>\n";
- } else {
- $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n";
- }
$self->{scores__} .= "<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
if ($self->{wmformat__} eq 'score') {
--- 952,957 ----
$self->{scores__} .= "<a name=\"scores\">";
+ $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n";
$self->{scores__} .= "<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
if ($self->{wmformat__} eq 'score') {
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.163
retrieving revision 1.164
diff -C2 -d -r1.163 -r1.164
*** MailParse.pm 24 Sep 2003 15:50:58 -0000 1.163
--- MailParse.pm 29 Sep 2003 21:54:44 -0000 1.164
***************
*** 1022,1026 ****
if ( $self->{in_html_tag__} ) {
! if ( $line =~ s/^(.*?)>// ) {
$self->{html_arg__} .= $1;
$self->{in_html_tag__} = 0;
--- 1022,1026 ----
if ( $self->{in_html_tag__} ) {
! if ( $line =~ s/^([^>]*?)>// ) {
$self->{html_arg__} .= $1;
$self->{in_html_tag__} = 0;
***************
*** 1052,1056 ****
# unclosed tag
! if ( $line =~ /^<([\/]?)([^ >]+)([^>]*)$/ ) {
$self->{html_end} = ( $1 eq '/' );
$self->{html_tag__} = $2;
--- 1052,1056 ----
# unclosed tag
! if ( $line =~ /^<([\/]?)([A-Za-z][^ >]+)([^>]*)$/ ) {
$self->{html_end} = ( $1 eq '/' );
$self->{html_tag__} = $2;
|