|
From: <jgr...@us...> - 2003-10-13 20:23:48
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv28571/Classifier
Modified Files:
Bayes.pm WordMangle.pm
Log Message:
FINAL PREPARATIONS FOR V0.20.0
Bring test suite to as close to 100% coverage as possible
Clean up Japanese/Korean code for better maintainability
Fix problem with random test suite crashes
Proxy/POP3.pm:
Add code to upgrade the welcome_string if the user has not
changed it from the default in a previous version of POPFile.
Classifier/Bayes.pm:
Remove a lot of code that was duplicated because of
the addition of Japanese and Korean support. To do
so created two helper methods: add_words_to_bucket__
and magnet_match__. There's still some duplicated code
(e.g. calls to these functions) but I can't find a good
way to deal with 'no locale' other than this.
Add hints for the code coverage about blocks of code.
Change the word scores code so that the scores are not
calculated if not required.
UI/HTML.pm:
Handle the display and non-display of the word matrix,
remove the stickiness of the word table format since
POPFile has not other 'sticky' values at this point.
Remove duplicated code introduced by Japanese and
Korean support.
tests.pl:
Accept multiple patterns on the command line so that
multiple tests can be specified at once. Patterns are
separated by commas. e.g. to run the HTTP and MailParse
test suites do:
gmake test TESTARGS=HTTP,MailParse
license:
Incorporate information about the BerkeleyDB license.
tests/TestWordMangle.tst:
Add tests for Japanese stop word support.
tests/TestModule.tst:
Add calls to dummy parent methods that do nothing, done
to get coverage to 100%.
tests/TestMailParse.tst
tests/TestPOP3.tst
tests/TestHTML.tst:
Make sure to stop the Bayes module to close the database.
tests/TestHTML.script:
Add tests for the new Single Message View where the word
matrix is not expanded by default. Test with and without
the word matrix and test the different views.
Added tests for Japanese and Korean stop words.
tests/TestProxy.tst:
Add tests for echo_response_'s handling of timeouts.
tests/languages/Korean.msg
tests/languages/Nihongo.msg:
Added the Korean and Japanese language files.
tests/TestMailParse022.cam
tests/TestMailParse022.wrd
tests/TestMailParse022.msg:
Split the From and Subject line to check the new
long header support.
tests/TestMailParse019.clr
tests/TestMailParse015.clr:
The information in this appeared to be wrong, so updated
to the latest output.
tests/TestMailParse.tst:
Add TestMailParse019 to the colorization tests.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.208
retrieving revision 1.209
diff -C2 -d -r1.208 -r1.209
*** Bayes.pm 10 Oct 2003 14:59:32 -0000 1.208
--- Bayes.pm 13 Oct 2003 20:23:40 -0000 1.209
***************
*** 73,77 ****
# Choice for the format of the "word matrix" display.
! $self->{wmformat__} = 'prob';
# Just our hostname
--- 73,77 ----
# Choice for the format of the "word matrix" display.
! $self->{wmformat__} = '';
# Just our hostname
***************
*** 356,361 ****
if ( defined( $value ) ) {
! my $total = $self->get_bucket_word_count( $bucket );
! return log( $value / $total );
} else {
return 0;
--- 356,367 ----
if ( defined( $value ) ) {
!
! # Profiling notes:
! #
! # I tried caching the log of the total value and then doing
! # log( $value ) - $cached and this turned out to be
! # much slower than this single log with a division in it
!
! return log( $value / $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} );
} else {
return 0;
***************
*** 407,415 ****
my $total = $self->get_bucket_word_count( $bucket );
! $total -= $oldvalue;
! $self->{full_total__} -= $oldvalue;
! $self->{matrix__}{$bucket}{$word} = $value;
! $total += $value;
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = $total;
$self->{full_total__} += $value;
--- 413,421 ----
my $total = $self->get_bucket_word_count( $bucket );
! $total -= $oldvalue;
! $self->{full_total__} -= $oldvalue;
! $self->{matrix__}{$bucket}{$word} = $value;
! $total += $value;
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = $total;
$self->{full_total__} += $value;
***************
*** 571,576 ****
if ( !defined( $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} ) ) {
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = 0;
! $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} = 0;
}
}
--- 577,582 ----
if ( !defined( $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} ) ) {
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = 0;
! $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} = 0;
}
}
***************
*** 847,850 ****
--- 853,892 ----
}
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # magnet_match__
+ #
+ # Helper the determines if a specific string matches a certain magnet type in a bucket
+ #
+ # $noattype The string to match
+ # $bucket The bucket to check
+ # $type The magnet type to check
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub magnet_match__
+ {
+ my ( $self, $noattype, $bucket, $type ) = @_;
+
+ for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
+ my $regex;
+
+ $regex = $magnet;
+ $regex =~ s/@/__POPFILE_AT__/g;
+ $regex =~ s/\$/__POPFILE_DOLLAR__/g;
+
+ if ( $noattype =~ m/\Q$regex\E/i ) {
+ $self->{scores__} = '';
+ $self->{magnet_used__} = 1;
+ $self->{magnet_detail__} = "$type: $magnet";
+
+ return 1;
+ }
+ }
+
+ return 0;
+ }
+
# ---------------------------------------------------------------------------------------------
#
***************
*** 898,935 ****
# Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
!
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
!
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
!
! return $bucket;
! }
! }
} else {
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
!
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
!
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
!
! return $bucket;
! }
! }
}
}
--- 940,948 ----
# Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
no locale;
! return $bucket if ( $self->magnet_match__( $noattype, $bucket, $type ) );
} else {
! return $bucket if ( $self->magnet_match__( $noattype, $bucket, $type ) );
}
}
***************
*** 946,950 ****
my %score;
my %matchcount;
- my %chi;
for my $bucket (@buckets) {
--- 959,962 ----
***************
*** 1070,1074 ****
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\" align=\"center\">$language{Score}</th><th scope=\"col\">$language{Probability}</th></tr>\n";
} else {
! $self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
}
--- 1082,1086 ----
$self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\" align=\"center\">$language{Score}</th><th scope=\"col\">$language{Probability}</th></tr>\n";
} else {
! $self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n";
}
***************
*** 1092,1101 ****
}
! if ($self->{wmformat__} eq 'score') {
$rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10);
$self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td align=right>$rawstr </td>\n<td>$probstr</td>\n</tr>\n";
} else {
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
! }
}
--- 1104,1113 ----
}
! if ($self->{wmformat__} eq 'score') {
$rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10);
$self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td align=right>$rawstr </td>\n<td>$probstr</td>\n</tr>\n";
} else {
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n";
! }
}
***************
*** 1107,1195 ****
$self->{scores__} .= "<!--format--><p>";
! $self->{scores__} .= "<table class=\"top20Words\">\n";
! $self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n";
! foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
! my $bucket = $ranking[$ix];
! my $bucketcolor = $self->get_bucket_color( $bucket );
! $self->{scores__} .= "<th><font color=\"$bucketcolor\">$bucket</font></th><th> </th>";
! }
! $self->{scores__} .= "</tr>";
! my %wordprobs;
! # If the word matrix is supposed to show probabilities, compute them,
! # saving the results in %wordprobs.
! if ( $self->{wmformat__} eq 'prob') {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! my $sumfreq = 0;
! my %wval;
! foreach my $bucket (@ranking) {
! $wval{$bucket} = exp(get_sort_value_( $self, $bucket, $word ));
! $sumfreq += $wval{$bucket};
! }
! foreach my $bucket (@ranking) {
! $wordprobs{$bucket,$word} = $wval{$bucket} / $sumfreq;
}
}
- }
! my @ranked_words;
! if ($self->{wmformat__} eq 'prob') {
! @ranked_words = sort {$wordprobs{$ranking[0],$b} <=> $wordprobs{$ranking[0],$a}} keys %{$self->{parser__}->{words__}};
! } else {
! @ranked_words = sort {$self->get_sort_value_( $ranking[0], $b ) <=> $self->get_sort_value_( $ranking[0], $a )} keys %{$self->{parser__}->{words__}};
! }
! foreach my $word (@ranked_words) {
! my $known = 0;
! foreach my $bucket (@ranking) {
! if ( $self->get_value_( $bucket, $word ) != 0 ) {
! $known = 1;
! last;
}
- }
! if ( $known == 1 ) {
! my $wordcolor = $self->get_color( $word );
! my $count = $self->{parser__}->{words__}{$word};
! $self->{scores__} .= "<tr>\n<td><font color=\"$wordcolor\">$word</font></td><td> </td><td>$count</td><td> </td>\n";
! my $base_probability = $self->get_value_( $ranking[0], $word );
! foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
! my $bucket = $ranking[$ix];
! my $probability = $self->get_value_( $bucket, $word );
! my $color = 'black';
! if ( $probability >= $base_probability || $base_probability == 0 ) {
! $color = $self->get_bucket_color( $bucket );
! }
! if ( $probability != 0 ) {
! my $wordprobstr;
! if ($self->{wmformat__} eq 'score') {
! $wordprobstr = sprintf("%12.4f", ($probability - $self->{not_likely__})/$log10 );
! } elsif ($self->{wmformat__} eq 'prob') {
! $wordprobstr = sprintf("%12.4f", $wordprobs{$bucket,$word});
} else {
! $wordprobstr = sprintf("%13.5f", exp($probability) );
}
-
- $self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n";
- } else {
- $self->{scores__} .= "<td> </td>\n<td> </td>\n";
}
}
}
! $self->{scores__} .= "</tr>";
}
-
- $self->{scores__} .= "</table></p>";
}
--- 1119,1209 ----
$self->{scores__} .= "<!--format--><p>";
! if ( $self->{wmformat__} ne '' ) {
! $self->{scores__} .= "<table class=\"top20Words\">\n";
! $self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n";
! foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
! my $bucket = $ranking[$ix];
! my $bucketcolor = $self->get_bucket_color( $bucket );
! $self->{scores__} .= "<th><font color=\"$bucketcolor\">$bucket</font></th><th> </th>";
! }
! $self->{scores__} .= "</tr>";
! my %wordprobs;
! # If the word matrix is supposed to show probabilities, compute them,
! # saving the results in %wordprobs.
! if ( $self->{wmformat__} eq 'prob') {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! my $sumfreq = 0;
! my %wval;
! foreach my $bucket (@ranking) {
! $wval{$bucket} = exp(get_sort_value_( $self, $bucket, $word ));
! $sumfreq += $wval{$bucket};
! }
! foreach my $bucket (@ranking) {
! $wordprobs{$bucket,$word} = $wval{$bucket} / $sumfreq;
! }
}
}
! my @ranked_words;
! if ($self->{wmformat__} eq 'prob') {
! @ranked_words = sort {$wordprobs{$ranking[0],$b} <=> $wordprobs{$ranking[0],$a}} keys %{$self->{parser__}->{words__}};
! } else {
! @ranked_words = sort {$self->get_sort_value_( $ranking[0], $b ) <=> $self->get_sort_value_( $ranking[0], $a )} keys %{$self->{parser__}->{words__}};
! }
! foreach my $word (@ranked_words) {
! my $known = 0;
! foreach my $bucket (@ranking) {
! if ( $self->get_base_value_( $bucket, $word ) != 0 ) {
! $known = 1;
! last;
! }
}
! if ( $known == 1 ) {
! my $wordcolor = $self->get_color( $word );
! my $count = $self->{parser__}->{words__}{$word};
! $self->{scores__} .= "<tr>\n<td><font color=\"$wordcolor\">$word</font></td><td> </td><td>$count</td><td> </td>\n";
! my $base_probability = $self->get_value_( $ranking[0], $word );
! foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
! my $bucket = $ranking[$ix];
! my $probability = $self->get_value_( $bucket, $word );
! my $color = 'black';
! if ( $probability >= $base_probability || $base_probability == 0 ) {
! $color = $self->get_bucket_color( $bucket );
! }
! if ( $probability != 0 ) {
! my $wordprobstr;
! if ($self->{wmformat__} eq 'score') {
! $wordprobstr = sprintf("%12.4f", ($probability - $self->{not_likely__})/$log10 );
! } elsif ($self->{wmformat__} eq 'prob') {
! $wordprobstr = sprintf("%12.4f", $wordprobs{$bucket,$word});
! } else {
! $wordprobstr = sprintf("%13.5f", exp($probability) );
! }
!
! $self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n";
} else {
! $self->{scores__} .= "<td> </td>\n<td> </td>\n";
}
}
}
+
+ $self->{scores__} .= "</tr>";
}
! $self->{scores__} .= "</table></p>";
}
}
***************
*** 1655,1659 ****
my ( $self, $bucket, $prefix ) = @_;
! return grep {/^$prefix/} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
}
--- 1669,1673 ----
my ( $self, $bucket, $prefix ) = @_;
! return grep {/^$prefix/} grep {!/^__POPFILE__/} keys %{$self->{matrix__}{$bucket}};
}
***************
*** 1680,1690 ****
if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
} else {
if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
} else {
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
}
}
--- 1694,1704 ----
if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/^__POPFILE__/} keys %{$self->{matrix__}{$bucket}};
} else {
if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} grep {!/^__POPFILE__/} keys %{$self->{matrix__}{$bucket}};
} else {
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/^__POPFILE__/} keys %{$self->{matrix__}{$bucket}};
}
}
***************
*** 1977,1980 ****
--- 1991,2015 ----
# add_messages_to_bucket
#
+ # Takes words previously parsed by the mail parser and adds/subtracts them to/from a bucket,
+ # this is a helper used by add_messages_to_bucket, remove_message_from_bucket
+ #
+ # $bucket Bucket to add to
+ # $subtract Set to -1 means subtract the words, set to 1 means add
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub add_words_to_bucket__
+ {
+ my ( $self, $bucket, $subtract ) = @_;
+
+ foreach my $word (keys %{$self->{parser__}->{words__}}) {
+ $self->set_value_( $bucket, $word, $subtract * $self->{parser__}->{words__}{$word} +
+ $self->get_base_value_( $bucket, $word ) );
+ }
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # add_messages_to_bucket
+ #
# Parses mail messages and updates the statistics in the specified bucket
#
***************
*** 1999,2017 ****
$self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
! # Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
! }
} else {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
! }
}
}
--- 2034,2047 ----
$self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
! # Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
no locale;
! $self->add_words_to_bucket__( $bucket, 1 );
} else {
! $self->add_words_to_bucket__( $bucket, 1 );
}
}
***************
*** 2069,2082 ****
# Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
! }
} else {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
! }
}
--- 2099,2108 ----
# Disable the locale in Korean mode, too.
!
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
no locale;
! $self->add_words_to_bucket__( $bucket, -1 );
} else {
! $self->add_words_to_bucket__( $bucket, -1 );
}
***************
*** 2195,2200 ****
}
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = 0;
! $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} = 0;
$self->load_word_matrix_();
--- 2221,2226 ----
}
! $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = 0;
! $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} = 0;
$self->load_word_matrix_();
***************
*** 2236,2241 ****
# Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
return sort keys %{$self->{magnets__}{$bucket}{$type}};
--- 2262,2266 ----
# Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
no locale;
return sort keys %{$self->{magnets__}{$bucket}{$type}};
Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** WordMangle.pm 1 Oct 2003 14:34:28 -0000 1.30
--- WordMangle.pm 13 Oct 2003 20:23:40 -0000 1.31
***************
*** 147,150 ****
--- 147,151 ----
#
# $stopword The word to add or remove
+ # $lang The current language
#
# Returns 1 if successful, or 0 for a bad stop word
|