From: <jgr...@us...> - 2003-04-07 17:56:25
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv11970/Classifier Modified Files: Bayes.pm MailParse.pm WordMangle.pm Log Message: Implement the Advanced and Buckets pages in the UI; bucket page is now indexed and sorted by word scores; add new stopword APIs to Bayes; remove hard coded English APIs; port Windows code so that we now have a working popup menu and tray icon Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.119 retrieving revision 1.120 diff -C2 -d -r1.119 -r1.120 *** Bayes.pm 26 Mar 2003 03:58:29 -0000 1.119 --- Bayes.pm 7 Apr 2003 17:55:44 -0000 1.120 *************** *** 431,435 **** if ( /([^\s]+) (\d+)/ ) { ! my $word = $self->{mangler__}->mangle($1,1); my $value = $2; $value =~ s/[\r\n]//g; --- 431,435 ---- if ( /([^\s]+) (\d+)/ ) { ! my $word = $1; my $value = $2; $value =~ s/[\r\n]//g; *************** *** 933,936 **** --- 933,955 ---- return $self->{total__}{$bucket}; } + + # --------------------------------------------------------------------------------------------- + # + # get_bucket_word_list + # + # Returns a list of bucket entries, each entry corresponds to all the words with the + # same leading character + # + # $bucket The name of the bucket for which the word count is desired + # + # --------------------------------------------------------------------------------------------- + + sub get_bucket_word_list + { + my ( $self, $bucket ) = @_; + + return $self->{matrix__}{$bucket}; + } + # --------------------------------------------------------------------------------------------- # *************** *** 1361,1364 **** --- 1380,1424 ---- delete $self->{magnets__}{$bucket}{$type}{$text}; $self->save_magnets__(); + } + + # --------------------------------------------------------------------------------------------- + # + # get_stop_word_list + # + # Gets the complete list of stop words + # + # --------------------------------------------------------------------------------------------- + + sub get_stop_word_list + { + my ( $self ) = @_; + + return $self->{parser__}->{mangle__}->stopwords(); + } + + # --------------------------------------------------------------------------------------------- + # + # add_stopword, remove_stopword + # + # Adds or removes a stop word + # + # $stopword The word to add or remove + # + # Return 0 for a bad stop word, and 1 otherwise + # + # --------------------------------------------------------------------------------------------- + + sub add_stopword + { + my ( $self, $stopword ) = @_; + + return $self->{parser__}->{mangle__}->add_stopword( $stopword ); + } + + sub remove_stopword + { + my ( $self, $stopword ) = @_; + + return $self->{parser__}->{mangle__}->remove_stopword( $stopword ); } Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.103 retrieving revision 1.104 diff -C2 -d -r1.103 -r1.104 *** MailParse.pm 30 Mar 2003 05:21:05 -0000 1.103 --- MailParse.pm 7 Apr 2003 17:55:46 -0000 1.104 *************** *** 133,136 **** --- 133,155 ---- # --------------------------------------------------------------------------------------------- # + # update_pseudoword + # + # Updates the word frequency for a pseudoword, note that this differs from update_word + # because it does no word mangling + # + # $prefix The pseudoword prefix (e.g. header) + # $word The pseudoword (e.g. Mime-Version) + # + # --------------------------------------------------------------------------------------------- + + sub update_pseudoword + { + my ( $self, $prefix, $word ) = @_; + + $self->increment_word( "$prefix:$word" ); + } + + # --------------------------------------------------------------------------------------------- + # # update_word # *************** *** 436,439 **** --- 455,463 ---- } + # Font sizes + if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) { + $self->update_pseudoword( 'html', "fontsize$value" ); + } + # Tags with background colors *************** *** 742,749 **** my $encoding = ''; ! # Variables to save header information to while parsing headers ! ! my $header; my $argument; --- 766,773 ---- my $encoding = ''; ! # Variables to save header information to while parsing headers ! ! my $header; my $argument; *************** *** 821,825 **** $splitline =~ s/\t/ /g; ! $self->{ut__} .= $splitline; } --- 845,849 ---- $splitline =~ s/\t/ /g; ! $self->{ut__} .= $splitline; } *************** *** 830,863 **** if ( $line =~ /^(\r\n|\r|\n)/) { ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! # Clear the saved headers $header = ''; ! $argument = ''; ! $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; ! } # If we have an email header then just keep the part after the : if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! # Save the new information for the current header ! $header = $1; ! $argument = $2; } ! # Append to argument if the next line begins with whitespace (isn't a new header) ! ! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) { $argument .= $1; } --- 854,887 ---- if ( $line =~ /^(\r\n|\r|\n)/) { ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! # Clear the saved headers $header = ''; ! $argument = ''; ! $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; ! } # If we have an email header then just keep the part after the : if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! # Save the new information for the current header ! $header = $1; ! $argument = $2; } ! # Append to argument if the next line begins with whitespace (isn't a new header) ! ! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) { $argument .= $1; } *************** *** 1078,1081 **** --- 1102,1112 ---- print "Header ($header) ($argument)\n" if ($self->{debug}); + + # After a discussion with Tim Peters and some looking at emails + # I'd received I discovered that the header names (case sensitive) are + # very significant in identifying different types of mail, for example + # much spam uses MIME-Version, MiME-Version and Mime-Version + + $self->update_pseudoword( 'header', $header ); # Handle the From, To and Cc headers and extract email addresses Index: WordMangle.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** WordMangle.pm 3 Mar 2003 15:21:41 -0000 1.21 --- WordMangle.pm 7 Apr 2003 17:55:48 -0000 1.22 *************** *** 24,47 **** my $self; ! $self->{stop__} = { ! 'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1, ! 'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this', ! 1, 'yes', 1, 'yet', 1, 'you', 1, 'http', 1, 'https', 1, 'mailto', 1, 'com', 1, 'with', 1, 'your', 1, 'that', 1, 'org', 1, 'cgi', 1, 'net', 1, 'www', 1, ! 'src', 1, 'smtp', 1, 'nbsp', 1, 'esmtp', 1, 'align', 1, 'valign', 1, 'width', 1, 'height', 1, 'border', 1, 'abbrev', 1, 'acronym', 1, 'address', 1, 'applet', 1, ! 'area', 1, 'author', 1, 'banner', 1, 'base', 1, 'basefont', 1, 'bgsound', 1, 'big', 1, 'blink', 1, 'blockquote', 1, 'body', 1, 'caption', 1, 'center', 1, 'cite', ! 1, 'code', 1, 'col', 1, 'colgroup', 1, 'del', 1, 'dfn', 1, 'dir', 1, 'div', 1, 'embed', 1, 'fig', 1, 'font', 1, 'form', 1, 'frame', 1, 'frameset', 1, 'head', 1, ! 'html', 1, 'iframe', 1, 'img', 1, 'input', 1, 'ins', 1, 'isindex', 1, 'kbd', 1, 'lang', 1, 'link', 1, 'listing', 1, 'map', 1, 'marquee', 1, 'math', 1, 'menu', 1, ! 'meta', 1, 'multicol', 1, 'nobr', 1, 'noframes', 1, 'note', 1, 'overlay', 1, 'param', 1, 'person', 1, 'plaintext', 1, 'pre', 1, 'range', 1, 'samp', 1, 'script', ! 1, 'select', 1, 'small', 1, 'spacer', 1, 'spot', 1, 'strike', 1, 'strong', 1, 'sub', 1, 'sup', 1, 'tab', 1, 'table', 1, 'tbody', 1, 'textarea', 1, 'textflow', 1, ! 'tfoot', 1, 'thead', 1, 'title', 1, 'var', 1, 'wbr', 1, 'xmp', 1, 'mon', 1, 'tue', 1, 'wed', 1, 'thu', 1, 'fri', 1, 'sat', 1, 'sun', 1, 'jan', 1, 'feb', 1, ! 'mar', 1, 'apr', 1, 'may', 1, 'jun', 1, 'jul', 1, 'aug', 1, 'sep', 1, 'oct', 1, 'nov', 1, 'dec', 1, 'est', 1, 'edt', 1, 'cst', 1, 'cdt', 1, 'pdt', 1, 'pst', 1, ! 'gmt', 1, 'subject', 1, 'date', 1, 'localhost', 1, 'received', 1, 'helo', 1, 'charset', 1, 'encoding', 1, 'htm', 1, 'mail', 1, 'alt', 1, 'cellspacing', 1, ! 'bgcolor', 1, 'serif', 1, 'sans', 1, 'helvetica', 1, 'color', 1, 'message', 1, 'path', 1, 'return', 1, 'span', 1, 'mbox', 1, 'status', 1, 'been', 1, 'being', 1, ! 'was', 1, 'were', 1, 'does', 1, 'doing', 1, 'done', 1, 'having', 1, 'goes', 1, 'going', 1, 'gone', 1, 'went', 1, 'could', 1, 'will', 1, 'would', 1, ! }; ! load_stop_words($self); ! return bless $self, $type; } --- 24,34 ---- my $self; ! $self->{stop__} = {}; ! bless $self, $type; ! $self->load_stop_words(); ! ! return $self; } *************** *** 93,100 **** # of the mail header # # --------------------------------------------------------------------------------------------- sub mangle { ! my ($self, $word, $allow_colon) = @_; # All words are treated as lowercase --- 80,89 ---- # of the mail header # + # $ignore_stops If defined ignores the stop word list + # # --------------------------------------------------------------------------------------------- sub mangle { ! my ($self, $word, $allow_colon, $ignore_stops) = @_; # All words are treated as lowercase *************** *** 104,108 **** # Stop words are ignored ! return '' if ( $self->{stop__}{$word} ); # Remove characters that would mess up a Perl regexp and replace with . --- 93,97 ---- # Stop words are ignored ! return '' if ( ( $self->{stop__}{$word} ) && ( !defined( $ignore_stops ) ) ); # Remove characters that would mess up a Perl regexp and replace with . *************** *** 126,129 **** --- 115,174 ---- return $word; + } + + # --------------------------------------------------------------------------------------------- + # + # add_stopword, remove_stopword + # + # Adds or removes a stop word + # + # $stopword The word to add or remove + # + # Returns 1 if successful, or 0 for a bad stop word + # --------------------------------------------------------------------------------------------- + + sub add_stopword + { + my ( $self, $stopword ) = @_; + + $stopword = $self->mangle( $stopword, 0, 1 ); + + if ( $stopword ne '' ) { + $self->{stop__}{$stopword} = 1; + $self->save_stop_words(); + + return 1; + } + + return 0; + } + + sub remove_stopword + { + my ( $self, $stopword ) = @_; + + $stopword = $self->mangle( $stopword, 0, 1 ); + + if ( $stopword ne '' ) { + delete $self->{stop__}{$stopword}; + $self->save_stop_words(); + + return 1; + } + + return 0; + } + + # GETTER/SETTERS + + sub stopwords + { + my ( $self, $value ) = @_; + + if ( defined( $value ) ) { + $self->{stop__} = $value; + } + + return $self->{stop__}; } |