|
From: <jgr...@us...> - 2003-03-03 15:22:23
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv12042/Classifier
Modified Files:
Bayes.pm MailParse.pm WordMangle.pm
Log Message:
Partial, still-broken work in progress on the POPFile refactoring. Treat this revision as READ ONLY for now; do not bother running it unless you are very brave.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.107
retrieving revision 1.108
diff -C2 -d -r1.107 -r1.108
*** Bayes.pm 28 Feb 2003 00:21:03 -0000 1.107
--- Bayes.pm 3 Mar 2003 15:21:37 -0000 1.108
***************
*** 2,5 ****
--- 2,8 ----
package Classifier::Bayes;
+ use POPFile::Module;
+ @ISA = ("POPFile::Module");
+
# ---------------------------------------------------------------------------------------------
#
***************
*** 20,24 ****
[...1348 lines suppressed...]
!
! # ---------------------------------------------------------------------------------------------
! #
! # set_bucket_parameter
! #
! # Sets the value associated with a bucket specific parameter
! #
! # $bucket The name of the bucket
! # $parameter The name of the parameter
! # $value The new value
! #
! # ---------------------------------------------------------------------------------------------
!
! sub set_bucket_parameter
! {
! my ( $self, $bucket, $parameter, $value ) = @_;
!
! $self->{parameters__}{$bucket}{$parameter} = $value;
}
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.91
retrieving revision 1.92
diff -C2 -d -r1.91 -r1.92
*** MailParse.pm 27 Feb 2003 09:49:38 -0000 1.91
--- MailParse.pm 3 Mar 2003 15:21:39 -0000 1.92
***************
*** 21,25 ****
my %entityhash;
! @entityhash{'amp', 'nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml'} = ( 38, 160,161,162,163,164,165,166,167,168,169,170,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 );
#----------------------------------------------------------------------------
--- 21,25 ----
my %entityhash;
! @entityhash{'amp', 'nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml'} = ( 38, 160,161,162,163,164,165,166,167,168,169,170,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 );
[...1700 lines suppressed...]
}
}
!
! $self->{base64__} = '';
!
return $colorized;
}
***************
*** 1114,1118 ****
#
# =?charset?[BQ]?text?=
! #
# A B indicates base64 encoding, a Q indicates quoted printable encoding
# ---------------------------------------------------------------------------------------------
--- 1111,1115 ----
#
# =?charset?[BQ]?text?=
! #
# A B indicates base64 encoding, a Q indicates quoted printable encoding
# ---------------------------------------------------------------------------------------------
Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** WordMangle.pm 22 Jan 2003 18:32:33 -0000 1.20
--- WordMangle.pm 3 Mar 2003 15:21:41 -0000 1.21
***************
*** 24,28 ****
my $self;
! $self->{stop} = {
'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1,
'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this',
--- 24,28 ----
my $self;
! $self->{stop__} = {
'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1,
'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this',
***************
*** 54,65 ****
{
my ($self) = @_;
!
if ( open STOPS, "<stopwords" ) {
! delete $self->{stop};
while ( <STOPS> ) {
s/[\r\n]//g;
! $self->{stop}{$_} = 1;
}
!
close STOPS;
}
--- 54,65 ----
{
my ($self) = @_;
!
if ( open STOPS, "<stopwords" ) {
! delete $self->{stop__};
while ( <STOPS> ) {
s/[\r\n]//g;
! $self->{stop__}{$_} = 1;
}
!
close STOPS;
}
***************
*** 69,78 ****
{
my ($self) = @_;
!
if ( open STOPS, ">stopwords" ) {
! for my $word (keys %{$self->{stop}}) {
print STOPS "$word\n";
}
!
close STOPS;
}
--- 69,78 ----
{
my ($self) = @_;
!
if ( open STOPS, ">stopwords" ) {
! for my $word (keys %{$self->{stop__}}) {
print STOPS "$word\n";
}
!
close STOPS;
}
***************
*** 99,119 ****
# All words are treated as lowercase
!
$word = lc($word);
# Stop words are ignored
!
! return '' if ( $self->{stop}{$word} );
# Remove characters that would mess up a Perl regexp and replace with .
!
$word =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g;
# Long words are ignored also
!
return '' if ( length($word) > 45 );
# Ditch long hex numbers
!
return '' if ( $word =~ /^[A-F0-9]{8,}$/i );
--- 99,119 ----
# All words are treated as lowercase
!
$word = lc($word);
# Stop words are ignored
!
! return '' if ( $self->{stop__}{$word} );
# Remove characters that would mess up a Perl regexp and replace with .
!
$word =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g;
# Long words are ignored also
!
return '' if ( length($word) > 45 );
# Ditch long hex numbers
!
return '' if ( $word =~ /^[A-F0-9]{8,}$/i );
|