|
From: <jgr...@us...> - 2003-04-12 21:16:59
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv1534/Classifier
Modified Files:
Bayes.pm MailParse.pm
Log Message:
Added infrastructure for QuickMagnets and made all the colorized output use words from the current language; also added false positive and false negative counting. NOTE THAT THESE ARE NOT FULLY WORKING; this check-in is so that Sam and I do not diverge too much.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.122
retrieving revision 1.123
diff -C2 -d -r1.122 -r1.123
*** Bayes.pm 12 Apr 2003 07:31:23 -0000 1.122
--- Bayes.pm 12 Apr 2003 21:16:52 -0000 1.123
***************
*** 475,478 ****
--- 475,479 ----
#
# $file The name of the file containing the text to classify
+ # $ui Reference to the UI used when doing colorization
#
# Splits the mail message into valid words, then runs the Bayes algorithm to figure out
***************
*** 482,486 ****
sub classify_file
{
! my ($self, $file) = @_;
my $msg_total = 0;
--- 483,487 ----
sub classify_file
{
! my ($self, $file, $ui) = @_;
my $msg_total = 0;
***************
*** 583,587 ****
my @ranking = sort {$score{$b} <=> $score{$a}} keys %score;
-
my %raw_score;
my $base_score = $score{$ranking[0]};
--- 584,587 ----
***************
*** 601,625 ****
}
! $self->{scores__} = "<b>Scores</b><p>\n<table class=\"top20Buckets\">\n<tr>\n<th scope=\"col\">Bucket</th>\n<th> </th>\n";
! $self->{scores__} .= "<th scope=\"col\">Probability</th></tr>\n";
! foreach my $b (@ranking) {
! my $prob = exp($score{$b})/$total;
! my $probstr;
! if ($prob >= 0.1 || $prob == 0.0) {
! $probstr = sprintf("%12.6f", $prob);
! } else {
! $probstr = sprintf("%17.6e", $prob);
! }
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n";
! }
! $self->{scores__} .= "</table>";
! if ($self->{wordscores__}) {
$self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n";
! $self->{scores__} .= "<tr>\n<th scope=\"col\">Word</th><th> </th><th scope=\"col\">Count</th><th> </th>\n";
foreach my $bucket (@buckets) {
--- 601,652 ----
}
! if ($self->{wordscores__} && defined($ui) ) {
! my @qm = @{$self->{parser__}->quickmagnets()};
! my %language = $ui->language();
! my $session_key = $ui->session_key();
! if ( $#qm >= 0 ) {
! $self->{scores__} = "<p><b>$language{QuickMagnets}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Magnet}</th>\n<th>$language{Magnet_Always}</th>\n";
! foreach my $m (@qm) {
! $self->{scores__} .= "<tr><td scope=\"col\">$m</td><td>";
! $self->{scores__} .= "<form action=\"/magnets\">\n";
! $self->{scores__} .= "<input type=\"hidden\" name=\"session\" value=\"$session_key\" />";
! $self->{scores__} .= "<input type=\"hidden\" name=\"type\" id=\"magnetsAddType\" />";
! $self->{scores__} .= "<input type=\"hidden\" name=\"text\" id=\"magnetsAddText\" />";
! $self->{scores__} .= "<select name=\"bucket\" id=\"magnetsAddBucket\">\n<option value=\"\"></option>\n";
! my @buckets = $self->get_buckets();
! foreach my $bucket (@buckets) {
! $self->{scores__} .= "<option value=\"$bucket\">$bucket</option>\n";
! }
!
! $self->{scores__} .= "</select><input type=\"submit\" class=\"submit\" name=\"create\" value=\"$language{Create}\" /></form></td></tr>";
! }
!
! $self->{scores__} .= "</table>";
! }
!
! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n";
! $self->{scores__} .= "<th scope=\"col\">$language{Probability}</th></tr>\n";
!
! foreach my $b (@ranking) {
! my $prob = exp($score{$b})/$total;
! my $probstr;
!
! if ($prob >= 0.1 || $prob == 0.0) {
! $probstr = sprintf("%12.6f", $prob);
! } else {
! $probstr = sprintf("%17.6e", $prob);
! }
!
! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n";
! }
!
! $self->{scores__} .= "</table><hr>";
$self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n";
! $self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n";
foreach my $bucket (@buckets) {
***************
*** 1082,1086 ****
$self->{parser__}->{bayes__} = bless $self;
my $result = $self->{parser__}->parse_stream($file);
! $self->{parser__}->{color__} = 0;
return $result;
--- 1109,1113 ----
$self->{parser__}->{bayes__} = bless $self;
my $result = $self->{parser__}->parse_stream($file);
! $self->{parser__}->{color__} = 0;
return $result;
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.111
retrieving revision 1.112
diff -C2 -d -r1.111 -r1.112
*** MailParse.pm 11 Apr 2003 02:25:45 -0000 1.111
--- MailParse.pm 12 Apr 2003 21:16:52 -0000 1.112
***************
*** 54,58 ****
$self->{color__} = 0;
! # This will store the from, to, cc and subject from the last parse
$self->{from__} = '';
$self->{to__} = '';
--- 54,58 ----
$self->{color__} = 0;
! # This will store the from, to, cc and subject from the last parse
$self->{from__} = '';
$self->{to__} = '';
***************
*** 60,63 ****
--- 60,69 ----
$self->{subject__} = '';
+ # This is used to store the words found in the from, to, cc and subject
+ # lines for use in creating new magnets. It is a list of strings of the
+ # form "type: word", e.g. "from: po...@jg..."
+
+ $self->{quickmagnets__} = ();
+
# These store the current HTML background color and font color to
# detect "invisible ink" used by spammers
***************
*** 174,177 ****
--- 180,187 ----
$mword = $prefix . ':' . $mword if ( $prefix ne '' );
+ if ( $prefix =~ /(from|to|cc|subject)/i ) {
+ push @{$self->{quickmagnets__}}, ("$prefix: $word");
+ }
+
if ( $self->{color__} ) {
my $color = $self->{bayes__}->get_color($mword);
***************
*** 184,187 ****
--- 194,198 ----
$self->{ut__} .= "<font color=\"$color\">$word<\/font> ";
}
+
} else {
increment_word( $self, $mword );
***************
*** 785,805 ****
$self->{base64__} = '';
!
# Variable to note that the temporary colorized storage is "frozen",
# and what type of freeze it is (allows nesting of reasons to freeze
# colorization)
!
$self->{in_html_tag__} = 0;
!
$self->{html_tag__} = '';
$self->{html_arg__} = '';
! $self->{words__} = {};
! $self->{msg_total__} = 0;
! $self->{from__} = '';
! $self->{to__} = '';
! $self->{cc__} = '';
! $self->{subject__} = '';
! $self->{ut__} = '';
$self->{htmlbackcolor__} = map_color( $self, 'white' );
--- 796,817 ----
$self->{base64__} = '';
!
# Variable to note that the temporary colorized storage is "frozen",
# and what type of freeze it is (allows nesting of reasons to freeze
# colorization)
!
$self->{in_html_tag__} = 0;
!
$self->{html_tag__} = '';
$self->{html_arg__} = '';
! $self->{words__} = {};
! $self->{msg_total__} = 0;
! $self->{from__} = '';
! $self->{to__} = '';
! $self->{cc__} = '';
! $self->{subject__} = '';
! $self->{ut__} = '';
! $self->{quickmagnets__} = ();
$self->{htmlbackcolor__} = map_color( $self, 'white' );
***************
*** 1196,1200 ****
if ( $header =~ /^Content-Type$/i ) {
-
if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
update_word( $self, $1, 0, '' , '', 'charset' );
--- 1208,1211 ----
***************
*** 1205,1210 ****
$self->{content_type__} = $1;
}
!
! if ( $argument =~ /multipart\//i ) {
my $boundary = $argument;
--- 1216,1221 ----
$self->{content_type__} = $1;
}
!
! if ( $argument =~ /multipart\//i ) {
my $boundary = $argument;
***************
*** 1293,1297 ****
--- 1304,1313 ----
}
+ sub quickmagnets
+ {
+ my ( $self ) = @_;
+ return $self->{quickmagnets__};
+ }
1;
|