|
From: <jgr...@us...> - 2003-10-28 01:09:02
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv1240/Classifier
Modified Files:
MailParse.pm WordMangle.pm
Log Message:
Allow the specification of pseudowords as stopwords
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.170
retrieving revision 1.171
diff -C2 -d -r1.170 -r1.171
*** MailParse.pm 20 Oct 2003 21:49:11 -0000 1.170
--- MailParse.pm 28 Oct 2003 01:06:46 -0000 1.171
***************
*** 318,321 ****
--- 318,323 ----
# $literal The literal text that generated this pseudoword
#
+ # Returns 0 if the pseudoword was filtered out by a stopword
+ #
# ---------------------------------------------------------------------------------------------
sub update_pseudoword
***************
*** 323,339 ****
my ( $self, $prefix, $word, $encoded, $literal ) = @_;
! my $mword = "$prefix:$word";
! if ( $self->{color__} ) {
! if ( $encoded == 1 ) {
! $literal =~ s/</</g;
! $literal =~ s/>/>/g;
! my $color = $self->{bayes__}->get_color($mword);
! my $to = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>";
! $self->{ut__} .= $to . ' ';
}
}
! $self->increment_word( $mword );
}
--- 325,346 ----
my ( $self, $prefix, $word, $encoded, $literal ) = @_;
! my $mword = $self->{mangle__}->mangle("$prefix:$word",1);
! if ( $mword ne '' ) {
! if ( $self->{color__} ) {
! if ( $encoded == 1 ) {
! $literal =~ s/</</g;
! $literal =~ s/>/>/g;
! my $color = $self->{bayes__}->get_color($mword);
! my $to = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>";
! $self->{ut__} .= $to . ' ';
! }
}
+
+ $self->increment_word( $mword );
+ return 1;
}
! return 0;
}
***************
*** 1535,1548 ****
print "Header ($header) ($argument)\n" if ($self->{debug__});
- if ( $self->{color__} ) {
- my $color = $self->{bayes__}->get_color( "header:$header" );
-
- my $fix_argument = $argument;
- $fix_argument =~ s/</</g;
- $fix_argument =~ s/>/>/g;
-
- $self->{ut__} = "<b><font color=\"$color\">$header</font></b>: $fix_argument\015\012";
- }
-
# After a discussion with Tim Peters and some looking at emails
# I'd received I discovered that the header names (case sensitive) are
--- 1542,1545 ----
***************
*** 1550,1554 ****
# much spam uses MIME-Version, MiME-Version and Mime-Version
! $self->update_pseudoword( 'header', $header, 0, $header );
# Check the encoding type in all RFC 2047 encoded headers
--- 1547,1564 ----
# much spam uses MIME-Version, MiME-Version and Mime-Version
! my $fix_argument = $argument;
! $fix_argument =~ s/</</g;
! $fix_argument =~ s/>/>/g;
!
! if ( $self->update_pseudoword( 'header', $header, 0, $header ) ) {
! if ( $self->{color__} ) {
! my $color = $self->{bayes__}->get_color( "header:$header" );
! $self->{ut__} = "<b><font color=\"$color\">$header</font></b>: $fix_argument\015\012";
! }
! } else {
! if ( $self->{color__} ) {
! $self->{ut__} = "$header: $fix_argument\015\012";
! }
! }
# Check the encoding type in all RFC 2047 encoded headers
Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** WordMangle.pm 13 Oct 2003 20:23:40 -0000 1.31
--- WordMangle.pm 28 Oct 2003 01:06:46 -0000 1.32
***************
*** 112,132 ****
# All words are treated as lowercase
! $word = lc($word);
# Stop words are ignored
! return '' if ( ( $self->{stop__}{$word} ) && ( !defined( $ignore_stops ) ) );
# Remove characters that would mess up a Perl regexp and replace with .
! $word =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g;
# Long words are ignored also
! return '' if ( length($word) > 45 );
# Ditch long hex numbers
! return '' if ( $word =~ /^[A-F0-9]{8,}$/i );
# Colons are forbidden inside words, we should never get passed a word
--- 112,134 ----
# All words are treated as lowercase
! my $lcword = lc($word);
# Stop words are ignored
! return '' if ( ( ( $self->{stop__}{$lcword} ) ||
! ( $self->{stop__}{$word} ) ) &&
! ( !defined( $ignore_stops ) ) );
# Remove characters that would mess up a Perl regexp and replace with .
! $lcword =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g;
# Long words are ignored also
! return '' if ( length($lcword) > 45 );
# Ditch long hex numbers
! return '' if ( $lcword =~ /^[A-F0-9]{8,}$/i );
# Colons are forbidden inside words, we should never get passed a word
***************
*** 135,141 ****
# for more details
! $word =~ s/://g if ( !defined( $allow_colon ) );
! return $word;
}
--- 137,143 ----
# for more details
! $lcword =~ s/://g if ( !defined( $allow_colon ) );
! return ($word =~ /:/ )?$word:$lcword;
}
***************
*** 163,172 ****
}
} else {
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
return 0;
}
}
! $stopword = $self->mangle( $stopword, 0, 1 );
if ( $stopword ne '' ) {
--- 165,174 ----
}
} else {
! if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) ) {
return 0;
}
}
! $stopword = $self->mangle( $stopword, 1, 1 );
if ( $stopword ne '' ) {
***************
*** 191,200 ****
}
} else {
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
return 0;
}
}
! $stopword = $self->mangle( $stopword, 0, 1 );
if ( $stopword ne '' ) {
--- 193,202 ----
}
} else {
! if ( ( $stopword !~ /:/ ) && ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) ) {
return 0;
}
}
! $stopword = $self->mangle( $stopword, 1, 1 );
if ( $stopword ne '' ) {
|