From: <jgr...@us...> - 2003-03-03 15:22:23
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv12042/Classifier Modified Files: Bayes.pm MailParse.pm WordMangle.pm Log Message: Partial and broken work on POPFile refactoring; READ ONLY at this point; do not bother running unless you are very brave Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.107 retrieving revision 1.108 diff -C2 -d -r1.107 -r1.108 *** Bayes.pm 28 Feb 2003 00:21:03 -0000 1.107 --- Bayes.pm 3 Mar 2003 15:21:37 -0000 1.108 *************** *** 2,5 **** --- 2,8 ---- package Classifier::Bayes; + use POPFile::Module; + @ISA = ("POPFile::Module"); + # --------------------------------------------------------------------------------------------- # *************** *** 20,24 **** [...1348 lines suppressed...] ! ! # --------------------------------------------------------------------------------------------- ! # ! # set_bucket_parameter ! # ! # Sets the value associated with a bucket specific parameter ! # ! # $bucket The name of the bucket ! # $parameter The name of the parameter ! # $value The new value ! # ! # --------------------------------------------------------------------------------------------- ! ! sub set_bucket_parameter ! { ! my ( $self, $bucket, $parameter, $value ) = @_; ! ! $self->{parameters__}{$bucket}{$parameter} = $value; } Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.91 retrieving revision 1.92 diff -C2 -d -r1.91 -r1.92 *** MailParse.pm 27 Feb 2003 09:49:38 -0000 1.91 --- MailParse.pm 3 Mar 2003 15:21:39 -0000 1.92 *************** *** 21,25 **** my %entityhash; ! @entityhash{'amp', 'nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml'} = ( 38, 160,161,162,163,164,165,166,167,168,169,170,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 ); #---------------------------------------------------------------------------- --- 21,25 ---- my %entityhash; ! @entityhash{'amp', 'nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml'} = ( 38, 160,161,162,163,164,165,166,167,168,169,170,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255 ); [...1700 lines suppressed...] } } ! ! $self->{base64__} = ''; ! return $colorized; } *************** *** 1114,1118 **** # # =?charset?[BQ]?text?= ! # # A B indicates base64 encoding, a Q indicates quoted printable encoding # --------------------------------------------------------------------------------------------- --- 1111,1115 ---- # # =?charset?[BQ]?text?= ! # # A B indicates base64 encoding, a Q indicates quoted printable encoding # --------------------------------------------------------------------------------------------- Index: WordMangle.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** WordMangle.pm 22 Jan 2003 18:32:33 -0000 1.20 --- WordMangle.pm 3 Mar 2003 15:21:41 -0000 1.21 *************** *** 24,28 **** my $self; ! $self->{stop} = { 'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1, 'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this', --- 24,28 ---- my $self; ! $self->{stop__} = { 'all', 1, 'also', 1, 'and', 1, 'any', 1, 'are', 1, 'ask', 1, 'but', 1, 'can', 1, 'com', 1, 'did', 1, 'edu', 1, 'etc', 1, 'for', 1, 'from', 1, 'had', 1, 'has', 1, 'have', 1, 'her', 1, 'him', 1, 'his', 1, 'inc', 1, 'its', 1, 'it\'s', 1, 'ltd', 1, 'may', 1, 'not', 1, 'off', 1, 'our', 1, 'out', 1, 'she', 1, 'the', 1, 'this', *************** *** 54,65 **** { my ($self) = @_; ! if ( open STOPS, "<stopwords" ) { ! delete $self->{stop}; while ( <STOPS> ) { s/[\r\n]//g; ! $self->{stop}{$_} = 1; } ! close STOPS; } --- 54,65 ---- { my ($self) = @_; ! if ( open STOPS, "<stopwords" ) { ! delete $self->{stop__}; while ( <STOPS> ) { s/[\r\n]//g; ! $self->{stop__}{$_} = 1; } ! close STOPS; } *************** *** 69,78 **** { my ($self) = @_; ! if ( open STOPS, ">stopwords" ) { ! for my $word (keys %{$self->{stop}}) { print STOPS "$word\n"; } ! close STOPS; } --- 69,78 ---- { my ($self) = @_; ! if ( open STOPS, ">stopwords" ) { ! for my $word (keys %{$self->{stop__}}) { print STOPS "$word\n"; } ! close STOPS; } *************** *** 99,119 **** # All words are treated as lowercase ! $word = lc($word); # Stop words are ignored ! ! return '' if ( $self->{stop}{$word} ); # Remove characters that would mess up a Perl regexp and replace with . ! $word =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g; # Long words are ignored also ! return '' if ( length($word) > 45 ); # Ditch long hex numbers ! return '' if ( $word =~ /^[A-F0-9]{8,}$/i ); --- 99,119 ---- # All words are treated as lowercase ! $word = lc($word); # Stop words are ignored ! ! return '' if ( $self->{stop__}{$word} ); # Remove characters that would mess up a Perl regexp and replace with . ! $word =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\./g; # Long words are ignored also ! return '' if ( length($word) > 45 ); # Ditch long hex numbers ! return '' if ( $word =~ /^[A-F0-9]{8,}$/i ); |