|
From: <jgr...@us...> - 2003-04-07 17:56:25
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv11970/Classifier
Modified Files:
Bayes.pm MailParse.pm WordMangle.pm
Log Message:
Implement the Advanced and Buckets pages in the UI; bucket page is now indexed and sorted by word scores; add new stopword APIs to Bayes; remove hard coded English APIs; port Windows code so that we now have a working popup menu and tray icon
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.119
retrieving revision 1.120
diff -C2 -d -r1.119 -r1.120
*** Bayes.pm 26 Mar 2003 03:58:29 -0000 1.119
--- Bayes.pm 7 Apr 2003 17:55:44 -0000 1.120
***************
*** 431,435 ****
if ( /([^\s]+) (\d+)/ ) {
! my $word = $self->{mangler__}->mangle($1,1);
my $value = $2;
$value =~ s/[\r\n]//g;
--- 431,435 ----
if ( /([^\s]+) (\d+)/ ) {
! my $word = $1;
my $value = $2;
$value =~ s/[\r\n]//g;
***************
*** 933,936 ****
--- 933,955 ----
return $self->{total__}{$bucket};
}
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # get_bucket_word_list
+ #
+ # Returns a list of bucket entries, each entry corresponds to all the words with the
+ # same leading character
+ #
+ # $bucket The name of the bucket for which the word count is desired
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub get_bucket_word_list
+ {
+ my ( $self, $bucket ) = @_;
+
+ return $self->{matrix__}{$bucket};
+ }
+
# ---------------------------------------------------------------------------------------------
#
***************
*** 1361,1364 ****
--- 1380,1424 ----
delete $self->{magnets__}{$bucket}{$type}{$text};
$self->save_magnets__();
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # get_stop_word_list
+ #
+ # Gets the complete list of stop words
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub get_stop_word_list
+ {
+ my ( $self ) = @_;
+
+ return $self->{parser__}->{mangle__}->stopwords();
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # add_stopword, remove_stopword
+ #
+ # Adds or removes a stop word
+ #
+ # $stopword The word to add or remove
+ #
+ # Return 0 for a bad stop word, and 1 otherwise
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub add_stopword
+ {
+ my ( $self, $stopword ) = @_;
+
+ return $self->{parser__}->{mangle__}->add_stopword( $stopword );
+ }
+
+ sub remove_stopword
+ {
+ my ( $self, $stopword ) = @_;
+
+ return $self->{parser__}->{mangle__}->remove_stopword( $stopword );
}
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.103
retrieving revision 1.104
diff -C2 -d -r1.103 -r1.104
*** MailParse.pm 30 Mar 2003 05:21:05 -0000 1.103
--- MailParse.pm 7 Apr 2003 17:55:46 -0000 1.104
***************
*** 133,136 ****
--- 133,155 ----
# ---------------------------------------------------------------------------------------------
#
+ # update_pseudoword
+ #
+ # Updates the word frequency for a pseudoword, note that this differs from update_word
+ # because it does no word mangling
+ #
+ # $prefix The pseudoword prefix (e.g. header)
+ # $word The pseudoword (e.g. Mime-Version)
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub update_pseudoword
+ {
+ my ( $self, $prefix, $word ) = @_;
+
+ $self->increment_word( "$prefix:$word" );
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
# update_word
#
***************
*** 436,439 ****
--- 455,463 ----
}
+ # Font sizes
+ if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
+ $self->update_pseudoword( 'html', "fontsize$value" );
+ }
+
# Tags with background colors
***************
*** 742,749 ****
my $encoding = '';
!
# Variables to save header information to while parsing headers
!
! my $header;
my $argument;
--- 766,773 ----
my $encoding = '';
!
# Variables to save header information to while parsing headers
!
! my $header;
my $argument;
***************
*** 821,825 ****
$splitline =~ s/\t/ /g;
!
$self->{ut__} .= $splitline;
}
--- 845,849 ----
$splitline =~ s/\t/ /g;
!
$self->{ut__} .= $splitline;
}
***************
*** 830,863 ****
if ( $line =~ /^(\r\n|\r|\n)/) {
!
# Parse the last header
!
($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!
# Clear the saved headers
$header = '';
! $argument = '';
!
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
! }
# If we have an email header then just keep the part after the :
if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
!
# Parse the last header
!
($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!
# Save the new information for the current header
!
$header = $1;
! $argument = $2;
}
!
# Append to argument if the next line begins with whitespace (isn't a new header)
!
! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {
$argument .= $1;
}
--- 854,887 ----
if ( $line =~ /^(\r\n|\r|\n)/) {
!
# Parse the last header
!
($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!
# Clear the saved headers
$header = '';
! $argument = '';
!
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
! }
# If we have an email header then just keep the part after the :
if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
!
# Parse the last header
!
($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!
# Save the new information for the current header
!
$header = $1;
! $argument = $2;
}
!
# Append to argument if the next line begins with whitespace (isn't a new header)
!
! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {
$argument .= $1;
}
***************
*** 1078,1081 ****
--- 1102,1112 ----
print "Header ($header) ($argument)\n" if ($self->{debug});
+
+ # After a discussion with Tim Peters and some looking at emails
+ # I'd received I discovered that the header names (case sensitive) are
+ # very significant in identifying different types of mail, for example
+ # much spam uses MIME-Version, MiME-Version and Mime-Version
+
+ $self->update_pseudoword( 'header', $header );
# Handle the From, To and Cc headers and extract email addresses
Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** WordMangle.pm 3 Mar 2003 15:21:41 -0000 1.21
--- WordMangle.pm 7 Apr 2003 17:55:48 -0000 1.22
***************
*** 24,47 ****
my $self;
! $self->{stop__} = {
! 'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1,
! 'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this',
! 1, 'yes', 1, 'yet', 1, 'you', 1, 'http', 1, 'https', 1, 'mailto', 1, 'com', 1, 'with', 1, 'your', 1, 'that', 1, 'org', 1, 'cgi', 1, 'net', 1, 'www', 1,
! 'src', 1, 'smtp', 1, 'nbsp', 1, 'esmtp', 1, 'align', 1, 'valign', 1, 'width', 1, 'height', 1, 'border', 1, 'abbrev', 1, 'acronym', 1, 'address', 1, 'applet', 1,
! 'area', 1, 'author', 1, 'banner', 1, 'base', 1, 'basefont', 1, 'bgsound', 1, 'big', 1, 'blink', 1, 'blockquote', 1, 'body', 1, 'caption', 1, 'center', 1, 'cite',
! 1, 'code', 1, 'col', 1, 'colgroup', 1, 'del', 1, 'dfn', 1, 'dir', 1, 'div', 1, 'embed', 1, 'fig', 1, 'font', 1, 'form', 1, 'frame', 1, 'frameset', 1, 'head', 1,
! 'html', 1, 'iframe', 1, 'img', 1, 'input', 1, 'ins', 1, 'isindex', 1, 'kbd', 1, 'lang', 1, 'link', 1, 'listing', 1, 'map', 1, 'marquee', 1, 'math', 1, 'menu', 1,
! 'meta', 1, 'multicol', 1, 'nobr', 1, 'noframes', 1, 'note', 1, 'overlay', 1, 'param', 1, 'person', 1, 'plaintext', 1, 'pre', 1, 'range', 1, 'samp', 1, 'script',
! 1, 'select', 1, 'small', 1, 'spacer', 1, 'spot', 1, 'strike', 1, 'strong', 1, 'sub', 1, 'sup', 1, 'tab', 1, 'table', 1, 'tbody', 1, 'textarea', 1, 'textflow', 1,
! 'tfoot', 1, 'thead', 1, 'title', 1, 'var', 1, 'wbr', 1, 'xmp', 1, 'mon', 1, 'tue', 1, 'wed', 1, 'thu', 1, 'fri', 1, 'sat', 1, 'sun', 1, 'jan', 1, 'feb', 1,
! 'mar', 1, 'apr', 1, 'may', 1, 'jun', 1, 'jul', 1, 'aug', 1, 'sep', 1, 'oct', 1, 'nov', 1, 'dec', 1, 'est', 1, 'edt', 1, 'cst', 1, 'cdt', 1, 'pdt', 1, 'pst', 1,
! 'gmt', 1, 'subject', 1, 'date', 1, 'localhost', 1, 'received', 1, 'helo', 1, 'charset', 1, 'encoding', 1, 'htm', 1, 'mail', 1, 'alt', 1, 'cellspacing', 1,
! 'bgcolor', 1, 'serif', 1, 'sans', 1, 'helvetica', 1, 'color', 1, 'message', 1, 'path', 1, 'return', 1, 'span', 1, 'mbox', 1, 'status', 1, 'been', 1, 'being', 1,
! 'was', 1, 'were', 1, 'does', 1, 'doing', 1, 'done', 1, 'having', 1, 'goes', 1, 'going', 1, 'gone', 1, 'went', 1, 'could', 1, 'will', 1, 'would', 1,
! };
! load_stop_words($self);
! return bless $self, $type;
}
--- 24,34 ----
my $self;
! $self->{stop__} = {};
! bless $self, $type;
! $self->load_stop_words();
!
! return $self;
}
***************
*** 93,100 ****
# of the mail header
#
# ---------------------------------------------------------------------------------------------
sub mangle
{
! my ($self, $word, $allow_colon) = @_;
# All words are treated as lowercase
--- 80,89 ----
# of the mail header
#
+ # $ignore_stops If defined ignores the stop word list
+ #
# ---------------------------------------------------------------------------------------------
sub mangle
{
! my ($self, $word, $allow_colon, $ignore_stops) = @_;
# All words are treated as lowercase
***************
*** 104,108 ****
# Stop words are ignored
! return '' if ( $self->{stop__}{$word} );
# Remove characters that would mess up a Perl regexp and replace with .
--- 93,97 ----
# Stop words are ignored
! return '' if ( ( $self->{stop__}{$word} ) && ( !defined( $ignore_stops ) ) );
# Remove characters that would mess up a Perl regexp and replace with .
***************
*** 126,129 ****
--- 115,174 ----
return $word;
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # add_stopword, remove_stopword
+ #
+ # Adds or removes a stop word
+ #
+ # $stopword The word to add or remove
+ #
+ # Returns 1 if successful, or 0 for a bad stop word
+ # ---------------------------------------------------------------------------------------------
+
+ sub add_stopword
+ {
+ my ( $self, $stopword ) = @_;
+
+ $stopword = $self->mangle( $stopword, 0, 1 );
+
+ if ( $stopword ne '' ) {
+ $self->{stop__}{$stopword} = 1;
+ $self->save_stop_words();
+
+ return 1;
+ }
+
+ return 0;
+ }
+
+ sub remove_stopword
+ {
+ my ( $self, $stopword ) = @_;
+
+ $stopword = $self->mangle( $stopword, 0, 1 );
+
+ if ( $stopword ne '' ) {
+ delete $self->{stop__}{$stopword};
+ $self->save_stop_words();
+
+ return 1;
+ }
+
+ return 0;
+ }
+
+ # GETTER/SETTERS
+
+ sub stopwords
+ {
+ my ( $self, $value ) = @_;
+
+ if ( defined( $value ) ) {
+ $self->{stop__} = $value;
+ }
+
+ return $self->{stop__};
}
|