[popfile-commit] engine traintest.pl,1.3,1.4

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine
In directory sc8-pr-cvs1:/tmp/cvs-serv23243

Modified Files:
	traintest.pl 
Log Message:
Add a corpus output option: running with "-dump 1" writes the accumulated corpus
to the "archive_corpus" subdirectory by default (use -corpus_out to change the location).

Index: traintest.pl
===================================================================
RCS file: /cvsroot/popfile/engine/traintest.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** traintest.pl	16 Mar 2003 01:32:31 -0000	1.3
--- traintest.pl	12 Apr 2003 07:20:50 -0000	1.4
***************
*** 21,24 ****
--- 21,26 ----
  my $DEFAULT_CLASSIFIER = "bayes";
  my $DEFAULT_ARCHIVE = "archive";
+ my $DEFAULT_DUMP = "0";
+ my $DEFAULT_CORPUS = "archive_corpus";
  
  $| = 1;
***************
*** 70,73 ****
--- 72,77 ----
      $config->parameter("classifier",$DEFAULT_CLASSIFIER);
      $config->parameter("archive_dir",$DEFAULT_ARCHIVE);
+     $config->parameter("dump",$DEFAULT_DUMP);
+     $config->parameter("corpus_out",$DEFAULT_CORPUS);
  
  }
***************
*** 132,147 ****
          my $wordvalue = $wordtab{$bucket."|".$word};
          $wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
! #        my $wordvalue = $b->get_value($bucket, $word);
! #        $b->set_value($bucket,$word, $wordvalue + $b->{parser}->{words}{$word} );
! #        $b->set_value($bucket,$word, $wordtab{$bucket."|".$word});
          $b->{total__}{$bucket}        += $b->{parser__}->{words__}{$word};
          $b->{unique__}{$bucket}       += 1 if ($wordvalue == 0);
      }
      $b->{full_total__} += $b->{parser__}{msg_total__};
-     foreach my $word (keys %wordtab) {
-         if ( $word =~ /^\Q$bucket\E\|(.*)$/ ) {
-            $b->set_value_($bucket,$1, $wordtab{$word});
-         }
-     }
      $b->update_constants_();
  }
--- 136,144 ----
          my $wordvalue = $wordtab{$bucket."|".$word};
          $wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
!         $b->set_value_($bucket,$word, $wordtab{$bucket."|".$word});
          $b->{total__}{$bucket}        += $b->{parser__}->{words__}{$word};
          $b->{unique__}{$bucket}       += 1 if ($wordvalue == 0);
      }
      $b->{full_total__} += $b->{parser__}{msg_total__};
      $b->update_constants_();
  }
***************
*** 164,167 ****
--- 161,192 ----
  }
  
+ sub dump_corpus
+ {
+     my ($self) = @_;
+ 
+     my $dir = $self->{configuration__}->parameter('corpus_out');
+     mkdir($dir);
+     
+     foreach my $abucket ( keys %{$self->{total__}} ) {
+         
+         print "saving $abucket corpus.\n";
+         
+         my $subdir = $dir;
+         $subdir .= "/$abucket";
+ 
+         mkdir($subdir);
+ 
+         open CORPUS, ">$dir/$abucket/table";
+         print CORPUS "__CORPUS__ __VERSION__ 1\n";
+         for my $ord ( @{$self->get_bucket_word_list($abucket)} ) {
+             if ( defined($ord) ) {
+                 while ($ord =~ s/\|([^ ]+) (\d+)\|//) {
+                     print CORPUS "$1 $2\n";
+                 }
+             }
+         }        
+     }    
+ }
+ 
  
  
***************
*** 255,264 ****
      initialize( $c );
  
!     $c->load_configuration();    
      
      $c->parse_command_line();
      
  #    $b->{unclassified} = ($c->parameter('unclassified_probability') || 0.0001);
!     $b->{unclassified__} = ($c->parameter("bayes_unclassified_probability") || 0.5);
      
      # test with or without stop-words    
--- 280,289 ----
      initialize( $c );
  
!     $c->load_configuration();
      
      $c->parse_command_line();
      
  #    $b->{unclassified} = ($c->parameter('unclassified_probability') || 0.0001);
!     $b->{unclassified__} = log($c->parameter("bayes_unclassified_probability") || 0.5);
      
      # test with or without stop-words    
***************
*** 269,273 ****
      
  
!     my $archive = $c->parameter("ui_archive_dir");
  
  
--- 294,298 ----
      
  
!     my $archive = $c->parameter("html_archive_dir");
  
  
***************
*** 398,401 ****
--- 423,432 ----
  
      my $end_time = time;
+     
+     if ($c->parameter("dump")) {
+         dump_corpus($b);
+         
+     }
+     
  
      my $total_messages = $#sorted_messages + 1;
***************
*** 417,420 ****
--- 448,453 ----
      print "     -toe:           Train Only Errors, defaults to $DEFAULT_TOE\n";
      print "     -stopwords:     Use stop-words, defaults to $DEFAULT_STOP\n";
+     print "     -dump:          Outputs accumulated corpus, defaults to $DEFAULT_DUMP\n";
+     print "     -corpus_out:    Location to save output corpus, defaults to $DEFAULT_CORPUS\n";
  }