From: <jgr...@us...> - 2003-04-12 21:16:59
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv1534/Classifier Modified Files: Bayes.pm MailParse.pm Log Message: Added infrastructure for QuickMagnets and made all the colorized output use words from the current language, also added false positive and false negative counting; NOTE THAT THESE ARE NOT FULLY WORKING; this check in is so that Sam and I do not diverge too much Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.122 retrieving revision 1.123 diff -C2 -d -r1.122 -r1.123 *** Bayes.pm 12 Apr 2003 07:31:23 -0000 1.122 --- Bayes.pm 12 Apr 2003 21:16:52 -0000 1.123 *************** *** 475,478 **** --- 475,479 ---- # # $file The name of the file containing the text to classify + # $ui Reference to the UI used when doing colorization # # Splits the mail message into valid words, then runs the Bayes algorithm to figure out *************** *** 482,486 **** sub classify_file { ! my ($self, $file) = @_; my $msg_total = 0; --- 483,487 ---- sub classify_file { ! my ($self, $file, $ui) = @_; my $msg_total = 0; *************** *** 583,587 **** my @ranking = sort {$score{$b} <=> $score{$a}} keys %score; - my %raw_score; my $base_score = $score{$ranking[0]}; --- 584,587 ---- *************** *** 601,625 **** } ! $self->{scores__} = "<b>Scores</b><p>\n<table class=\"top20Buckets\">\n<tr>\n<th scope=\"col\">Bucket</th>\n<th> </th>\n"; ! $self->{scores__} .= "<th scope=\"col\">Probability</th></tr>\n"; ! foreach my $b (@ranking) { ! my $prob = exp($score{$b})/$total; ! my $probstr; ! if ($prob >= 0.1 || $prob == 0.0) { ! $probstr = sprintf("%12.6f", $prob); ! } else { ! $probstr = sprintf("%17.6e", $prob); ! } ! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n"; ! } ! $self->{scores__} .= "</table>"; ! if ($self->{wordscores__}) { $self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n"; ! $self->{scores__} .= "<tr>\n<th scope=\"col\">Word</th><th> </th><th scope=\"col\">Count</th><th> </th>\n"; foreach my $bucket (@buckets) { --- 601,652 ---- } ! if ($self->{wordscores__} && defined($ui) ) { ! my @qm = @{$self->{parser__}->quickmagnets()}; ! my %language = $ui->language(); ! my $session_key = $ui->session_key(); ! if ( $#qm >= 0 ) { ! $self->{scores__} = "<p><b>$language{QuickMagnets}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Magnet}</th>\n<th>$language{Magnet_Always}</th>\n"; ! foreach my $m (@qm) { ! $self->{scores__} .= "<tr><td scope=\"col\">$m</td><td>"; ! $self->{scores__} .= "<form action=\"/magnets\">\n"; ! $self->{scores__} .= "<input type=\"hidden\" name=\"session\" value=\"$session_key\" />"; ! $self->{scores__} .= "<input type=\"hidden\" name=\"type\" id=\"magnetsAddType\" />"; ! $self->{scores__} .= "<input type=\"hidden\" name=\"text\" id=\"magnetsAddText\" />"; ! $self->{scores__} .= "<select name=\"bucket\" id=\"magnetsAddBucket\">\n<option value=\"\"></option>\n"; ! my @buckets = $self->get_buckets(); ! foreach my $bucket (@buckets) { ! $self->{scores__} .= "<option value=\"$bucket\">$bucket</option>\n"; ! } ! ! $self->{scores__} .= "</select><input type=\"submit\" class=\"submit\" name=\"create\" value=\"$language{Create}\" /></form></td></tr>"; ! } ! ! $self->{scores__} .= "</table>"; ! } ! ! $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n"; ! $self->{scores__} .= "<th scope=\"col\">$language{Probability}</th></tr>\n"; ! ! foreach my $b (@ranking) { ! my $prob = exp($score{$b})/$total; ! my $probstr; ! ! if ($prob >= 0.1 || $prob == 0.0) { ! $probstr = sprintf("%12.6f", $prob); ! } else { ! $probstr = sprintf("%17.6e", $prob); ! } ! ! $self->{scores__} .= "<tr>\n<td><font color=\"$self->{colors__}{$b}\"><b>$b</b></font></td>\n<td> </td>\n<td>$probstr</td>\n</tr>\n"; ! } ! ! $self->{scores__} .= "</table><hr>"; $self->{scores__} .= "<table class=\"top20Words\">\n<tr><td colspan=\"4\"> </td></tr>\n"; ! $self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n"; foreach my $bucket (@buckets) { *************** *** 1082,1086 **** $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_stream($file); ! $self->{parser__}->{color__} = 0; return $result; --- 1109,1113 ---- $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_stream($file); ! $self->{parser__}->{color__} = 0; return $result; Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.111 retrieving revision 1.112 diff -C2 -d -r1.111 -r1.112 *** MailParse.pm 11 Apr 2003 02:25:45 -0000 1.111 --- MailParse.pm 12 Apr 2003 21:16:52 -0000 1.112 *************** *** 54,58 **** $self->{color__} = 0; ! # This will store the from, to, cc and subject from the last parse $self->{from__} = ''; $self->{to__} = ''; --- 54,58 ---- $self->{color__} = 0; ! # This will store the from, to, cc and subject from the last parse $self->{from__} = ''; $self->{to__} = ''; *************** *** 60,63 **** --- 60,69 ---- $self->{subject__} = ''; + # This is used to store the words found in the from, to, and subject + # lines for use in creating new magnets, it is a list of pairs mapping + # a magnet type to a magnet string, e.g. from => po...@jg... + + $self->{quickmagnets__} = (); + # These store the current HTML background color and font color to # detect "invisible ink" used by spammers *************** *** 174,177 **** --- 180,187 ---- $mword = $prefix . ':' . $mword if ( $prefix ne '' ); + if ( $prefix =~ /(from|to|cc|subject)/i ) { + push @{$self->{quickmagnets__}}, ("$prefix: $word"); + } + if ( $self->{color__} ) { my $color = $self->{bayes__}->get_color($mword); *************** *** 184,187 **** --- 194,198 ---- $self->{ut__} .= "<font color=\"$color\">$word<\/font> "; } + } else { increment_word( $self, $mword ); *************** *** 785,805 **** $self->{base64__} = ''; ! # Variable to note that the temporary colorized storage is "frozen", # and what type of freeze it is (allows nesting of reasons to freeze # colorization) ! $self->{in_html_tag__} = 0; ! $self->{html_tag__} = ''; $self->{html_arg__} = ''; ! $self->{words__} = {}; ! $self->{msg_total__} = 0; ! $self->{from__} = ''; ! $self->{to__} = ''; ! $self->{cc__} = ''; ! $self->{subject__} = ''; ! $self->{ut__} = ''; $self->{htmlbackcolor__} = map_color( $self, 'white' ); --- 796,817 ---- $self->{base64__} = ''; ! # Variable to note that the temporary colorized storage is "frozen", # and what type of freeze it is (allows nesting of reasons to freeze # colorization) ! $self->{in_html_tag__} = 0; ! $self->{html_tag__} = ''; $self->{html_arg__} = ''; ! $self->{words__} = {}; ! $self->{msg_total__} = 0; ! $self->{from__} = ''; ! $self->{to__} = ''; ! $self->{cc__} = ''; ! $self->{subject__} = ''; ! $self->{ut__} = ''; ! $self->{quickmagnets__} = (); $self->{htmlbackcolor__} = map_color( $self, 'white' ); *************** *** 1196,1200 **** if ( $header =~ /^Content-Type$/i ) { - if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { update_word( $self, $1, 0, '' , '', 'charset' ); --- 1208,1211 ---- *************** *** 1205,1210 **** $self->{content_type__} = $1; } ! ! if ( $argument =~ /multipart\//i ) { my $boundary = $argument; --- 1216,1221 ---- $self->{content_type__} = $1; } ! ! if ( $argument =~ /multipart\//i ) { my $boundary = $argument; *************** *** 1293,1297 **** --- 1304,1313 ---- } + sub quickmagnets + { + my ( $self ) = @_; + return $self->{quickmagnets__}; + } 1; |