|
From: <ssc...@us...> - 2003-04-12 07:20:54
|
Update of /cvsroot/popfile/engine
In directory sc8-pr-cvs1:/tmp/cvs-serv23243
Modified Files:
traintest.pl
Log Message:
Add corpus output option: -dump 1
writes the accumulated corpus to the "archive_corpus" subdirectory by default (override with -corpus_out).
Index: traintest.pl
===================================================================
RCS file: /cvsroot/popfile/engine/traintest.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** traintest.pl 16 Mar 2003 01:32:31 -0000 1.3
--- traintest.pl 12 Apr 2003 07:20:50 -0000 1.4
***************
*** 21,24 ****
--- 21,26 ----
my $DEFAULT_CLASSIFIER = "bayes";
my $DEFAULT_ARCHIVE = "archive";
+ my $DEFAULT_DUMP = "0";
+ my $DEFAULT_CORPUS = "archive_corpus";
$| = 1;
***************
*** 70,73 ****
--- 72,77 ----
$config->parameter("classifier",$DEFAULT_CLASSIFIER);
$config->parameter("archive_dir",$DEFAULT_ARCHIVE);
+ $config->parameter("dump",$DEFAULT_DUMP);
+ $config->parameter("corpus_out",$DEFAULT_CORPUS);
}
***************
*** 132,147 ****
my $wordvalue = $wordtab{$bucket."|".$word};
$wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
! # my $wordvalue = $b->get_value($bucket, $word);
! # $b->set_value($bucket,$word, $wordvalue + $b->{parser}->{words}{$word} );
! # $b->set_value($bucket,$word, $wordtab{$bucket."|".$word});
$b->{total__}{$bucket} += $b->{parser__}->{words__}{$word};
$b->{unique__}{$bucket} += 1 if ($wordvalue == 0);
}
$b->{full_total__} += $b->{parser__}{msg_total__};
- foreach my $word (keys %wordtab) {
- if ( $word =~ /^\Q$bucket\E\|(.*)$/ ) {
- $b->set_value_($bucket,$1, $wordtab{$word});
- }
- }
$b->update_constants_();
}
--- 136,144 ----
my $wordvalue = $wordtab{$bucket."|".$word};
$wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
! $b->set_value_($bucket,$word, $wordtab{$bucket."|".$word});
$b->{total__}{$bucket} += $b->{parser__}->{words__}{$word};
$b->{unique__}{$bucket} += 1 if ($wordvalue == 0);
}
$b->{full_total__} += $b->{parser__}{msg_total__};
$b->update_constants_();
}
***************
*** 164,167 ****
--- 161,192 ----
}
+ sub dump_corpus
+ {
+ my ($self) = @_;
+
+ my $dir = $self->{configuration__}->parameter('corpus_out');
+ mkdir($dir);
+
+ foreach my $abucket ( keys %{$self->{total__}} ) {
+
+ print "saving $abucket corpus.\n";
+
+ my $subdir = $dir;
+ $subdir .= "/$abucket";
+
+ mkdir($subdir);
+
+ open CORPUS, ">$dir/$abucket/table";
+ print CORPUS "__CORPUS__ __VERSION__ 1\n";
+ for my $ord ( @{$self->get_bucket_word_list($abucket)} ) {
+ if ( defined($ord) ) {
+ while ($ord =~ s/\|([^ ]+) (\d+)\|//) {
+ print CORPUS "$1 $2\n";
+ }
+ }
+ }
+ }
+ }
+
***************
*** 255,264 ****
initialize( $c );
! $c->load_configuration();
$c->parse_command_line();
# $b->{unclassified} = ($c->parameter('unclassified_probability') || 0.0001);
! $b->{unclassified__} = ($c->parameter("bayes_unclassified_probability") || 0.5);
# test with or without stop-words
--- 280,289 ----
initialize( $c );
! $c->load_configuration();
$c->parse_command_line();
# $b->{unclassified} = ($c->parameter('unclassified_probability') || 0.0001);
! $b->{unclassified__} = log($c->parameter("bayes_unclassified_probability") || 0.5);
# test with or without stop-words
***************
*** 269,273 ****
! my $archive = $c->parameter("ui_archive_dir");
--- 294,298 ----
! my $archive = $c->parameter("html_archive_dir");
***************
*** 398,401 ****
--- 423,432 ----
my $end_time = time;
+
+ if ($c->parameter("dump")) {
+ dump_corpus($b);
+
+ }
+
my $total_messages = $#sorted_messages + 1;
***************
*** 417,420 ****
--- 448,453 ----
print " -toe: Train Only Errors, defaults to $DEFAULT_TOE\n";
print " -stopwords: Use stop-words, defaults to $DEFAULT_STOP\n";
+ print " -dump: Outputs accumulated corpus, defaults to $DEFAULT_DUMP\n";
+ print " -corpus_out: Location to save output corpus, defaults to $DEFAULT_CORPUS\n";
}
|