Thread: [popfile-commit] engine/Classifier Bayes.pm,1.119,1.120 MailParse.pm,1.103,1.104 WordMangle.pm,1.21,

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv11970/Classifier

Modified Files:
	Bayes.pm MailParse.pm WordMangle.pm 
Log Message:
Implement the Advanced and Buckets pages in the UI; bucket page is now indexed and sorted by word scores; add new stopword APIs to Bayes; remove hard coded English APIs; port Windows code so that we now have a working popup menu and tray icon

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.119
retrieving revision 1.120
diff -C2 -d -r1.119 -r1.120
*** Bayes.pm	26 Mar 2003 03:58:29 -0000	1.119
--- Bayes.pm	7 Apr 2003 17:55:44 -0000	1.120
***************
*** 431,435 ****

              if ( /([^\s]+) (\d+)/ ) {
!                 my $word = $self->{mangler__}->mangle($1,1);
                  my $value = $2;
                  $value =~ s/[\r\n]//g;
--- 431,435 ----

              if ( /([^\s]+) (\d+)/ ) {
!                 my $word  = $1;
                  my $value = $2;
                  $value =~ s/[\r\n]//g;
***************
*** 933,936 ****
--- 933,955 ----
      return $self->{total__}{$bucket};
  }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # get_bucket_word_list
+ #
+ # Returns a list of bucket entries, each entry corresponds to all the words with the
+ # same leading character
+ #
+ # $bucket      The name of the bucket for which the word count is desired
+ #
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub get_bucket_word_list
+ {
+     my ( $self, $bucket ) = @_;
+ 
+     return $self->{matrix__}{$bucket};
+ }
+ 
  # ---------------------------------------------------------------------------------------------
  #
***************
*** 1361,1364 ****
--- 1380,1424 ----
      delete $self->{magnets__}{$bucket}{$type}{$text};
      $self->save_magnets__();
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # get_stop_word_list
+ #
+ # Gets the complete list of stop words
+ #
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub get_stop_word_list
+ {
+     my ( $self ) = @_;
+ 
+     return $self->{parser__}->{mangle__}->stopwords();
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # add_stopword, remove_stopword
+ #
+ # Adds or removes a stop word
+ #
+ # $stopword    The word to add or remove
+ #
+ # Return 0 for a bad stop word, and 1 otherwise
+ #
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub add_stopword
+ {
+     my ( $self, $stopword ) = @_;
+ 
+     return $self->{parser__}->{mangle__}->add_stopword( $stopword );
+ }
+ 
+ sub remove_stopword
+ {
+     my ( $self, $stopword ) = @_;
+ 
+     return $self->{parser__}->{mangle__}->remove_stopword( $stopword );
  }

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.103
retrieving revision 1.104
diff -C2 -d -r1.103 -r1.104
*** MailParse.pm	30 Mar 2003 05:21:05 -0000	1.103
--- MailParse.pm	7 Apr 2003 17:55:46 -0000	1.104
***************
*** 133,136 ****
--- 133,155 ----
  # ---------------------------------------------------------------------------------------------
  #
+ # update_pseudoword
+ #
+ # Updates the word frequency for a pseudoword, note that this differs from update_word
+ # because it does no word mangling
+ #
+ # $prefix       The pseudoword prefix (e.g. header)
+ # $word         The pseudoword (e.g. Mime-Version)
+ #
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub update_pseudoword
+ {
+     my ( $self, $prefix, $word ) = @_;
+ 
+     $self->increment_word( "$prefix:$word" );
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
  # update_word
  #
***************
*** 436,439 ****
--- 455,463 ----
          }

+         # Font sizes
+         if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
+             $self->update_pseudoword( 'html', "fontsize$value" );
+         }
+ 
          # Tags with background colors

***************
*** 742,749 ****

      my $encoding = '';
!     
      # Variables to save header information to while parsing headers
!     
!     my $header;    
      my $argument;

--- 766,773 ----

      my $encoding = '';
! 
      # Variables to save header information to while parsing headers
! 
!     my $header;
      my $argument;

***************
*** 821,825 ****

                  $splitline =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
!                     
                  $self->{ut__} .= $splitline;
              }
--- 845,849 ----

                  $splitline =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
! 
                  $self->{ut__} .= $splitline;
              }
***************
*** 830,863 ****

                  if ( $line =~ /^(\r\n|\r|\n)/) {
!                     
                       # Parse the last header
!                                          
                      ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!                     
                      # Clear the saved headers
                      $header   = '';
!                     $argument = '';          
!                                  
                      $self->{in_headers__} = 0;
                      print "Header parsing complete.\n" if $self->{debug};
!                 }                

                  # If we have an email header then just keep the part after the :

                  if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ )  {
!                     
                      # Parse the last header
!                     
                      ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!                     
                      # Save the new information for the current header
!                     
                      $header   = $1;
!                     $argument = $2;                
                  }
!                 
                  # Append to argument if the next line begins with whitespace (isn't a new header)
!                 
!                 if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {                    
                      $argument .= $1;
                  }
--- 854,887 ----

                  if ( $line =~ /^(\r\n|\r|\n)/) {
! 
                       # Parse the last header
! 
                      ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
! 
                      # Clear the saved headers
                      $header   = '';
!                     $argument = '';
! 
                      $self->{in_headers__} = 0;
                      print "Header parsing complete.\n" if $self->{debug};
!                 }

                  # If we have an email header then just keep the part after the :

                  if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ )  {
! 
                      # Parse the last header
! 
                      ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
! 
                      # Save the new information for the current header
! 
                      $header   = $1;
!                     $argument = $2;
                  }
! 
                  # Append to argument if the next line begins with whitespace (isn't a new header)
! 
!                 if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {
                      $argument .= $1;
                  }
***************
*** 1078,1081 ****
--- 1102,1112 ----

      print "Header ($header) ($argument)\n" if ($self->{debug});
+ 
+     # After a discussion with Tim Peters and some looking at emails
+     # I'd received I discovered that the header names (case sensitive) are
+     # very significant in identifying different types of mail, for example
+     # much spam uses MIME-Version, MiME-Version and Mime-Version
+ 
+     $self->update_pseudoword( 'header', $header );

      # Handle the From, To and Cc headers and extract email addresses

Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** WordMangle.pm	3 Mar 2003 15:21:41 -0000	1.21
--- WordMangle.pm	7 Apr 2003 17:55:48 -0000	1.22
***************
*** 24,47 ****
      my $self;

!     $self->{stop__} = {
!           'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1,
!           'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this',
!           1, 'yes', 1, 'yet', 1, 'you', 1, 'http', 1, 'https', 1, 'mailto', 1, 'com',  1, 'with',  1, 'your',  1, 'that',  1, 'org',  1, 'cgi',  1, 'net',  1, 'www',  1,
!           'src',  1, 'smtp', 1, 'nbsp', 1, 'esmtp', 1, 'align', 1, 'valign', 1, 'width', 1, 'height', 1, 'border', 1, 'abbrev', 1, 'acronym', 1, 'address', 1, 'applet', 1,
!           'area', 1, 'author', 1, 'banner', 1, 'base', 1, 'basefont', 1, 'bgsound', 1, 'big', 1, 'blink', 1, 'blockquote', 1, 'body', 1, 'caption', 1, 'center', 1, 'cite',
!           1, 'code', 1, 'col', 1, 'colgroup', 1, 'del', 1, 'dfn', 1, 'dir', 1, 'div', 1, 'embed', 1, 'fig', 1, 'font', 1, 'form', 1, 'frame', 1, 'frameset', 1, 'head', 1,
!           'html', 1, 'iframe', 1, 'img', 1, 'input', 1, 'ins', 1, 'isindex', 1, 'kbd', 1, 'lang', 1, 'link', 1, 'listing', 1, 'map', 1, 'marquee', 1, 'math', 1, 'menu', 1,
!           'meta', 1, 'multicol', 1, 'nobr', 1, 'noframes', 1, 'note', 1, 'overlay', 1, 'param', 1, 'person', 1, 'plaintext', 1, 'pre', 1, 'range', 1, 'samp', 1, 'script',
!           1, 'select', 1, 'small', 1, 'spacer', 1, 'spot', 1, 'strike', 1, 'strong', 1, 'sub', 1, 'sup', 1, 'tab', 1, 'table', 1, 'tbody', 1, 'textarea', 1, 'textflow', 1,
!           'tfoot', 1, 'thead', 1, 'title', 1, 'var', 1, 'wbr', 1, 'xmp', 1, 'mon', 1, 'tue', 1, 'wed', 1, 'thu', 1, 'fri', 1, 'sat', 1, 'sun', 1, 'jan', 1, 'feb', 1,
!           'mar', 1, 'apr', 1, 'may', 1, 'jun', 1, 'jul', 1, 'aug', 1, 'sep', 1, 'oct', 1, 'nov', 1, 'dec', 1, 'est', 1, 'edt', 1, 'cst', 1, 'cdt', 1, 'pdt', 1, 'pst', 1,
!           'gmt', 1, 'subject', 1, 'date', 1, 'localhost', 1, 'received', 1, 'helo', 1, 'charset', 1, 'encoding', 1, 'htm', 1, 'mail', 1, 'alt', 1, 'cellspacing', 1,
!           'bgcolor', 1, 'serif', 1, 'sans', 1, 'helvetica', 1, 'color', 1, 'message', 1, 'path', 1, 'return', 1, 'span', 1, 'mbox', 1, 'status', 1, 'been', 1, 'being', 1,
!           'was', 1, 'were', 1, 'does', 1, 'doing', 1, 'done', 1, 'having', 1, 'goes', 1, 'going', 1, 'gone', 1, 'went', 1, 'could', 1, 'will', 1, 'would', 1,
!          };

!     load_stop_words($self);

!     return bless $self, $type;
  }

--- 24,34 ----
      my $self;

!     $self->{stop__} = {};

!     bless $self, $type;

!     $self->load_stop_words();
! 
!     return $self;
  }

***************
*** 93,100 ****
  #               of the mail header
  #
  # ---------------------------------------------------------------------------------------------
  sub mangle
  {
!     my ($self, $word, $allow_colon) = @_;

      # All words are treated as lowercase
--- 80,89 ----
  #               of the mail header
  #
+ # $ignore_stops If defined ignores the stop word list
+ #
  # ---------------------------------------------------------------------------------------------
  sub mangle
  {
!     my ($self, $word, $allow_colon, $ignore_stops) = @_;

      # All words are treated as lowercase
***************
*** 104,108 ****
      # Stop words are ignored

!     return '' if ( $self->{stop__}{$word} );

      # Remove characters that would mess up a Perl regexp and replace with .
--- 93,97 ----
      # Stop words are ignored

!     return '' if ( ( $self->{stop__}{$word} ) && ( !defined( $ignore_stops ) ) );

      # Remove characters that would mess up a Perl regexp and replace with .
***************
*** 126,129 ****
--- 115,174 ----

      return $word;
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # add_stopword, remove_stopword
+ #
+ # Adds or removes a stop word
+ #
+ # $stopword    The word to add or remove
+ #
+ # Returns 1 if successful, or 0 for a bad stop word
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub add_stopword
+ {
+     my ( $self, $stopword ) = @_;
+ 
+     $stopword = $self->mangle( $stopword, 0, 1 );
+ 
+     if ( $stopword ne '' ) {
+         $self->{stop__}{$stopword} = 1;
+         $self->save_stop_words();
+ 
+        return 1;
+     }
+ 
+     return 0;
+ }
+ 
+ sub remove_stopword
+ {
+     my ( $self, $stopword ) = @_;
+ 
+     $stopword = $self->mangle( $stopword, 0, 1 );
+ 
+     if ( $stopword ne '' ) {
+         delete $self->{stop__}{$stopword};
+         $self->save_stop_words();
+ 
+         return 1;
+     }
+ 
+     return 0;
+ }
+ 
+ # GETTER/SETTERS
+ 
+ sub stopwords
+ {
+     my ( $self, $value ) = @_;
+ 
+     if ( defined( $value ) ) {
+         $self->{stop__} = $value;
+     }
+ 
+     return $self->{stop__};
  }