From: <jgr...@us...> - 2003-03-05 21:16:12
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv23158/Classifier Modified Files: Bayes.pm MailParse.pm Log Message: Statistics and reclassification now work; all configuration parameters now fixed correctly and configuration screens work; history now updates; can look up words; TODO: magnets not working and security page partly broken, need to decide whether to drop stop words or not Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.113 retrieving revision 1.114 diff -C2 -d -r1.113 -r1.114 *** Bayes.pm 5 Mar 2003 20:06:15 -0000 1.113 --- Bayes.pm 5 Mar 2003 21:15:29 -0000 1.114 *************** *** 111,121 **** # Subject modification (global setting is on) ! $self->config_( 'subject', 1 ); # Adding the X-Text-Classification on ! $self->config_( 'xtc', 1 ); # Adding the X-POPFile-Link is no ! $self->config_( 'xpl', 1 ); # No default unclassified probability --- 111,121 ---- # Subject modification (global setting is on) ! $self->global_config_( 'subject', 1 ); # Adding the X-Text-Classification on ! $self->global_config_( 'xtc', 1 ); # Adding the X-POPFile-Link is no ! $self->global_config_( 'xpl', 1 ); # No default unclassified probability *************** *** 801,805 **** # Add the Subject line modification or the original line back again if ( $classification ne 'unclassified' ) { ! if ( $self->config_( 'subject' ) ) { # Don't add the classification unless it is not present if ( !( $msg_subject =~ /\[\Q$classification\E\]/ ) && --- 801,805 ---- # Add the Subject line modification or the original line back again if ( $classification ne 'unclassified' ) { ! if ( $self->global_config_( 'subject' ) ) { # Don't add the classification unless it is not present if ( !( $msg_subject =~ /\[\Q$classification\E\]/ ) && *************** *** 819,823 **** # Add the XTC header ! $msg_head_after .= "X-Text-Classification: $classification$eol" if ( ( $self->config_( 'xtc' ) ) && ( $self->{parameters__}{$classification}{quarantine} == 0 ) ); --- 819,823 ---- # Add the XTC header ! $msg_head_after .= "X-Text-Classification: $classification$eol" if ( ( $self->global_config_( 'xtc' ) ) && ( $self->{parameters__}{$classification}{quarantine} == 0 ) ); *************** *** 828,835 **** $xpl .= "<http://"; ! $xpl .= $self->config_( 'localpop' )?"127.0.0.1":$self->{hostname__}; ! $xpl .= ":$self->config_( 'ui_port' )/jump_to_message?view=$temp_file>$eol"; ! if ( $self->config_( 'xpl' ) && ( $self->{parameters__}{$classification}{quarantine} == 0 ) ) { $msg_head_after .= 'X-POPFile-Link: ' . $xpl; } --- 828,835 ---- $xpl .= "<http://"; ! $xpl .= $self->module_config_( 'pop3', 'local' )?"127.0.0.1":$self->{hostname__}; ! $xpl .= ":" . $self->module_config_( 'ui', 'port' ) . "/jump_to_message?view=$temp_file>$eol"; ! if ( $self->global_config_( 'xpl' ) && ( $self->{parameters__}{$classification}{quarantine} == 0 ) ) { $msg_head_after .= 'X-POPFile-Link: ' . $xpl; } *************** *** 846,853 **** if ( $classification ne 'unclassified' ) { if ( $self->{parameters__}{$classification}{quarantine} == 1 ) { ! print $client "From: $self->{parser__}->{from}$eol"; ! print $client "To: $self->{parser__}->{to}$eol"; ! print $client "Date: $self->{parser__}->{date}$eol"; ! if ( $self->config_( 'subject' ) ) { # Don't add the classification unless it is not present if ( !( $msg_subject =~ /\[\Q$classification\E\]/ ) && --- 846,853 ---- if ( $classification ne 'unclassified' ) { if ( $self->{parameters__}{$classification}{quarantine} == 1 ) { ! print $client "From: " . $self->{parser__}->get_header( 'from' ) . "$eol"; ! print $client "To: " . $self->{parser__}->get_header( 'to' ) . "$eol"; ! print $client "Date: " . $self->{parser__}->get_header( 'date' ) . "$eol"; ! if ( $self->global_config_( 'subject' ) ) { # Don't add the classification unless it is not present if ( !( $msg_subject =~ /\[\Q$classification\E\]/ ) && *************** *** 857,869 **** } print $client "Subject:$msg_subject$eol"; ! print $client "X-Text-Classification: $classification$eol" if ( $self->config_( 'xtc' ) ); ! print $client 'X-POPFile-Link: ' . $xpl if ( $self->config_( 'xpl' ) ); print $client "Content-Type: multipart/report; boundary=\"$temp_file\"$eol$eol--$temp_file$eol"; print $client "Content-Type: text/plain$eol$eol"; print $client "POPFile has quarantined a message. It is attached to this email.$eol$eol"; print $client "Quarantined Message Detail$eol$eol"; ! print $client "Original From: $self->{parser__}->{from}$eol"; ! print $client "Original Subject: $self->{parser__}->{subject}$eol"; ! print $client "Original To: $self->{parser__}->{to}$eol$eol"; print $client "To examine the email open the attachment. To change this mail's classification go to $xpl$eol"; print $client "--$temp_file$eol"; --- 857,869 ---- } print $client "Subject:$msg_subject$eol"; ! print $client "X-Text-Classification: $classification$eol" if ( $self->global_config_( 'xtc' ) ); ! print $client 'X-POPFile-Link: ' . $xpl if ( $self->global_config_( 'xpl' ) ); print $client "Content-Type: multipart/report; boundary=\"$temp_file\"$eol$eol--$temp_file$eol"; print $client "Content-Type: text/plain$eol$eol"; print $client "POPFile has quarantined a message. It is attached to this email.$eol$eol"; print $client "Quarantined Message Detail$eol$eol"; ! print $client "Original From: " . $self->{parser__}->get_header('from') . "$eol"; ! print $client "Original To: " . $self->{parser__}->get_header('to') . "$eol"; ! print $client "Original Subject: " . $self->{parser__}->get_header('subject') . "$eol"; print $client "To examine the email open the attachment. To change this mail's classification go to $xpl$eol"; print $client "--$temp_file$eol"; *************** *** 963,983 **** # --------------------------------------------------------------------------------------------- # - # get_bucket_word_list - # - # Returns a list of the words in a bucket - # - # $bucket The name of the bucket for which the list is to be obtained - # - # --------------------------------------------------------------------------------------------- - - sub get_bucket_word_list - { - my ( $self, $bucket ) = @_; - - return sort keys %{$self->{matrix__}{$bucket}}; - } - - # --------------------------------------------------------------------------------------------- - # # get_bucket_color # --- 963,966 ---- *************** *** 1158,1196 **** my ( $self, $file, $bucket ) = @_; $self->{parser__}->parse_stream( $file ); foreach my $word (keys %{$self->{parser__}->{words__}}) { ! # TODO $self->set_value_( $bucket, $word, $ += $self->{parser__}->{words__}{$word}; ! $self->{total__}{$bucket} += $self->{parser__}->{words__}{$word}; } ! $self->update_constants_(); ! $self->write_parameters(); ! $self->save_bucket_( $bucket ); } # --------------------------------------------------------------------------------------------- # ! # save_bucket_ # ! # Save the current word scores in a bucket to disk # ! # $bucket Name for the bucket to save # # --------------------------------------------------------------------------------------------- ! sub save_bucket_ { ! my ( $self, $bucket ) = @_; if ( open WORDS, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) { print WORDS "__CORPUS__ __VERSION__ 1\n"; ! foreach my $word ( keys %{$self->{matrix__}{$bucket}}) { ! print WORDS "$word $self->{matrix__}{$bucket}{$word}\n"; } close WORDS; } ! $self->load_bucket_($self->config_( 'corpus' ) . "/$bucket"); } --- 1141,1243 ---- my ( $self, $file, $bucket ) = @_; + my %words; + + if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" ) { + while (<WORDS>) { + if ( /__CORPUS__ __VERSION__ (\d+)/ ) { + if ( $1 != $self->{corpus_version__} ) { + print "Incompatible corpus version in $bucket\n"; + return; + } + + next; + } + + if ( /([^\s]+) (\d+)/ ) { + my $word = $self->{mangler__}->mangle($1,1); + my $value = $2; + $value =~ s/[\r\n]//g; + if ( $value > 0 ) { + $words{$word} = $value; + } + } + } + + close WORDS; + } + $self->{parser__}->parse_stream( $file ); foreach my $word (keys %{$self->{parser__}->{words__}}) { ! $words{$word} += $self->{parser__}->{words__}{$word}; } ! if ( open WORDS, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) { ! print WORDS "__CORPUS__ __VERSION__ 1\n"; ! foreach my $word (sort keys %words) { ! print WORDS "$word $words{$word}\n"; ! } ! close WORDS; ! } ! ! $self->load_word_matrix_(); } # --------------------------------------------------------------------------------------------- # ! # remove_message_from_bucket # ! # Parses a mail message and updates the statistics in the specified bucket # ! # $file Name of file containing mail message to parse ! # $bucket Name of the bucket to be updated # # --------------------------------------------------------------------------------------------- ! sub remove_message_from_bucket { ! my ( $self, $file, $bucket ) = @_; ! ! my %words; ! ! if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" ) { ! while (<WORDS>) { ! if ( /__CORPUS__ __VERSION__ (\d+)/ ) { ! if ( $1 != $self->{corpus_version__} ) { ! print "Incompatible corpus version in $bucket\n"; ! return; ! } ! ! next; ! } ! ! if ( /([^\s]+) (\d+)/ ) { ! my $word = $self->{mangler__}->mangle($1,1); ! my $value = $2; ! $value =~ s/[\r\n]//g; ! if ( $value > 0 ) { ! $words{$word} = $value; ! } ! } ! } ! ! close WORDS; ! } ! ! $self->{parser__}->parse_stream( $file ); ! ! foreach my $word (keys %{$self->{parser__}->{words__}}) { ! $words{$word} -= $self->{parser__}->{words__}{$word}; ! } if ( open WORDS, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) { print WORDS "__CORPUS__ __VERSION__ 1\n"; ! foreach my $word (sort keys %words) { ! print WORDS "$word $words{$word}\n"; } close WORDS; } ! $self->load_word_matrix_(); } Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.93 retrieving revision 1.94 diff -C2 -d -r1.93 -r1.94 *** MailParse.pm 4 Mar 2003 21:25:19 -0000 1.93 --- MailParse.pm 5 Mar 2003 21:15:32 -0000 1.94 *************** *** 1137,1140 **** --- 1137,1153 ---- } + # --------------------------------------------------------------------------------------------- + # + # get_header - Returns the value of the from, to, subject or cc header + # + # $header Name of header to return (note must be lowercase) + # + # --------------------------------------------------------------------------------------------- + sub get_header + { + my ( $self, $header ) = @_; + + return $self->{$header . '__'}; + } 1; |