|
From: <jgr...@us...> - 2003-09-03 20:47:51
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv30888/Classifier
Modified Files:
Bayes.pm
Log Message:
Implemented add-one smoothing of the loaded corpus. Changed unclassified_probability to unclassified_weight; it is now used to compare the top chosen bucket with the second: the top must be unclassified_weight times more probable than the second to avoid an unclassified result.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.182
retrieving revision 1.183
diff -C2 -d -r1.182 -r1.183
*** Bayes.pm 3 Sep 2003 19:37:06 -0000 1.182
--- Bayes.pm 3 Sep 2003 20:47:47 -0000 1.183
***************
*** 102,107 ****
$self->{magnet_count__} = 0;
! # The unclassified cutoff probability
! $self->{unclassified__} = 0.5;
# Used to tell the caller whether a magnet was used in the last
--- 102,108 ----
$self->{magnet_count__} = 0;
! # The unclassified cutoff: the top probability must be n times greater than the
! # second probability; the default is 100 times more likely
! $self->{unclassified__} = log(100);
# Used to tell the caller whether a magnet was used in the last
***************
*** 130,135 ****
my ( $self ) = @_;
! # No default unclassified probability
! $self->config_( 'unclassified_probability', 0 );
# The corpus is kept in the 'corpus' subfolder of POPFile
--- 131,137 ----
my ( $self ) = @_;
! # The unclassified weight is the number of times more sure POPFile
! # must be of the top class vs the second class; the default is 100 times more
! $self->config_( 'unclassified_weight', 100 );
# The corpus is kept in the 'corpus' subfolder of POPFile
***************
*** 165,174 ****
my ( $self ) = @_;
! if ( $self->config_( 'unclassified_probability' ) != 0 ) {
! $self->{unclassified__} = $self->config_( 'unclassified_probability' );
! }
!
! $self->{unclassified__} = log($self->{unclassified__});
!
$self->load_word_matrix_();
--- 167,171 ----
my ( $self ) = @_;
! $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) );
$self->load_word_matrix_();
***************
*** 520,523 ****
--- 517,526 ----
my $value = $2;
if ( $value > 0 ) {
+
+ # Here we do a simple "add one" smoothing on the data being
+ # loaded
+
+ $value += 1;
+
$self->{total__}{$bucket} += $value;
$self->{unique__}{$bucket} += 1;
***************
*** 806,812 ****
my $class = 'unclassified';
! # if ( ( $total != 0 ) && ( $score{$ranking[0]} > $self->{unclassified__} + log($total) ) ) {
$class = $ranking[0];
! # }
return $class;
--- 809,815 ----
my $class = 'unclassified';
! if ( $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) {
$class = $ranking[0];
! }
return $class;
|