|
From: <ssc...@us...> - 2003-03-13 03:08:41
|
Update of /cvsroot/popfile/engine
In directory sc8-pr-cvs1:/tmp/cvs-serv24984
Added Files:
traintest.pl
Log Message:
adding traintest.pl "perl traintest.pl -usage" for basic help, email me for more for now
--- NEW FILE: traintest.pl ---
#!/usr/bin/perl
# ---------------------------------------------------------------------------------------------
#
# traintest.pl - Simulate training on errors using a saved archive.
#
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------
use strict;
use Classifier::Bayes;
use POPFile::Configuration;
my $debug = 0;
my $DEFAULT_CSV = 'auto';
my $DEFAULT_TOE = 1;
my $DEFAULT_WINDOW1 = 100;
my $DEFAULT_WINDOW2 = 500;
my $DEFAULT_CLASSIFIER = "bayes";
my $DEFAULT_ARCHIVE = "archive";
$| = 1;
$, = ", ";
my %wordtab;
# ---------------------------------------------------------------------------------------------
#
# compare_mf - Compares two mailfiles, used for sorting mail into order
#
# ---------------------------------------------------------------------------------------------
# NB, wasn't able to get this to work as part of HTML.pm
sub compare_mf
{
my $ad;
my $bd;
my $am;
my $bm;
if ( $a =~ /popfile(.*)=(.*)\.msg/ ) {
$ad = $1;
$am = $2;
if ( $b =~ /popfile(.*)=(.*)\.msg/ ) {
$bd = $1;
$bm = $2;
if ( $ad == $bd ) {
return ( $bm <=> $am );
} else {
return ( $bd <=> $ad );
}
}
}
return 0;
}
sub initialize
{
my ($config) = @_;
$config->parameter("csv",$DEFAULT_CSV);
$config->parameter("toe",$DEFAULT_TOE);
$config->parameter("window1",$DEFAULT_WINDOW1);
$config->parameter("window2",$DEFAULT_WINDOW2);
$config->parameter("classifier",$DEFAULT_CLASSIFIER);
$config->parameter("archive_dir",$DEFAULT_ARCHIVE);
}
sub cvs_out
{
my ($config, $file, @log) = @_;
if ($file eq 'auto') {
if ($config->parameter('toe') == 1) {
$file = "toe";
} else {
$file = "te";
}
if ( ( $config->parameter('window1') ne $DEFAULT_WINDOW1 ) && \
( $config->parameter('window2') ne $DEFAULT_WINDOW2 ) ) {
$file .= "$config->parameter('window1')and$config->parameter('window2')";
}
$file .= ".csv";
}
print STDERR "Printing data to $file\n";
open CSV, ">$file";
$, = ",";
print CSV sort keys %{ @log[1] };
foreach my $message (0 .. $#log) {
$message = @log[$message];
foreach my $item (sort keys %{$message}) {
my $value = $message->{$item};
$value =~ s/\"/'/g;
print CSV "\"$value\",";
}
print CSV "\n";
}
print STDERR "Data saved to $file\n";
$, = "";
}
sub reclassify
{
my ($b,$bucket) = @_;
foreach my $word (keys %{ $b->{parser__}->{words__} }) {
my $wordvalue = $wordtab{$bucket."|".$word};
$wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
# my $wordvalue = $b->get_value($bucket, $word);
# $b->set_value($bucket,$word, $wordvalue + $b->{parser}->{words}{$word} );
# $b->set_value($bucket,$word, $wordtab{$bucket."|".$word});
$b->{total__}{$bucket} += $b->{parser__}->{words__}{$word};
$b->{unique__}{$bucket} += 1 if ($wordvalue == 0);
}
$b->{full_total__} += $b->{parser__}{msg_total__};
foreach my $word (keys %wordtab) {
if ( $word =~ /^\Q$bucket\E\|(.*)$/ ) {
$b->set_value_($bucket,$1, $wordtab{$word});
}
}
$b->update_constants_();
}
sub retrain_decider
{
my ($toe,$predicted,$actual) = @_;
if ($toe == 1) {
return ($predicted ne $actual);
} elsif ($toe == 0) {
return 1;
} elsif ($toe == 2) {
return (rand > .5);
}
}
sub find_messages
{
my ( $archive ) = @_;
my $messages = {};
my @buckets_array = glob "$archive/*";
foreach my $abucket (@buckets_array) {
$abucket =~ s/.*\/(.*)$/$1/;
print "saved bucket $abucket opening\n" if ($debug);
my @dir_messages;
my $subdirs = 1;
my $locations = 0;
#$messages->{messages}{$abucket} = {};
my @subdirectories = glob "$archive/$abucket/*";
foreach my $dir (@subdirectories) {
$dir =~ s/.*\/(.*)$/$1/;
print "$abucket subdirectory $dir opening..." if ($debug);
if ( opendir(DIR,"$archive/$abucket/$dir") ) {
print "(open):\n" if ($debug);
$messages->{buckets}{$abucket}{subdirs}{$dir} = {};
$messages->{locations}[$locations] = "$archive/$abucket/$dir";
$locations++;
$subdirs = 1;
closedir(DIR);
foreach my $message ( glob( "$archive/$abucket/$dir/*.msg" ) ) {
my $shortname = $message;
$shortname =~ s/.*\/(.*)$/$1/;
$messages->{messages}{$shortname}{bucket} = $abucket;
$messages->{messages}{$shortname}{long} = $message;
$messages->{buckets}{$abucket}{messages}{$shortname} = $abucket;
$messages->{buckets}{$abucket}{subdirs}{$dir}{$shortname} = $abucket;
}
} else {
print ":" if ($debug);
$subdirs = 0;
last;
}
}
if ($subdirs == 0) {
print "$abucket root dir opening messages:\n" if $debug;
foreach my $message ( glob( "$archive/$abucket/*.msg") ) {
my $shortname = $message;
$shortname =~ s/.*\/(.*)$/$1/;
$messages->{messages}{$shortname}{bucket} = $abucket;
$messages->{messages}{$shortname}{long} = $message;
$messages->{buckets}{$abucket}{messages}{$shortname} = $abucket;
}
$messages->{locations}[$locations] = "$archive/$abucket";
$locations++;
}
}
return $messages;
}
my $self = {};
# main
if ( @ARGV[0] ne "-usage")
{
my $b = new Classifier::Bayes;
my $c = new POPFile::Configuration;
$b->configuration( $c );
$c->configuration( $c );
$c->initialize();
$b->initialize();
initialize( $c );
$c->load_configuration();
$c->parse_command_line();
# $b->{unclassified} = ($c->parameter('unclassified_probability') || 0.0001);
$b->{unclassified__} = ($c->parameter("bayes_unclassified_probability") || 0.5);
my $archive = $c->parameter("ui_archive_dir");
#load the messages
$self->{messages} = find_messages($archive);
#some debug output to verify bucket structure
if ($debug) {
foreach my $abucket ( keys %{ $self->{messages}->{buckets} } ) {
print "$abucket:" . (keys %{ $self->{messages}->{buckets}{$abucket}{messages} }). "\n";
#print keys %{$self->{messages}->{$abucket}};
foreach my $message (keys %{ $self->{messages}->{buckets}{$abucket}{messages} } ) {
print "\t$message $self->{messages}->{buckets}{$abucket}{messages}{$message}\n";
}
}
}
# build a bucket skeleton
foreach my $abucket ( keys %{ $self->{messages}->{buckets} } ) {
$b->{total__}{$abucket} = 0;
$b->{colors__}{$abucket} = "black";
}
$self->{messages}->{buckets}{unclassified} = {};
#sort the messages
my @sorted_messages = sort compare_mf keys %{$self->{messages}{messages}};
# my @sorted_messages = keys %{$self->{messages}{messages}};
#more debug output
if ($debug) {
print "sorted " . $#sorted_messages . ": ";
foreach my $index (0 .. $#sorted_messages ) {
print " " . @sorted_messages[$index];
}
print "\n\n";
}
$b->{debug} = 0;
$b->{parser__}->{debug} = 0;
my ($bucket_class, $bucket_true, $correct);
my $error_count = 0;
my @errors;
my @clog;
#loop through all saved messages, classify
my $start_time = time;
my $total_size = 0;
foreach my $index (0 .. $#sorted_messages ) {
$correct = 1;
my $message_count = $index + 1;
$index = $#sorted_messages - $index;
print "\n$self->{messages}->{messages}{ @sorted_messages[$index] }{long}:" if ($debug);
$bucket_class = $b->classify_file( $self->{messages}->{messages}{ @sorted_messages[$index] }{long});
$bucket_true = $self->{messages}->{messages}{ @sorted_messages[$index] }{bucket};
if ($bucket_class ne $bucket_true) {
@errors[$error_count] = {};
@errors[$error_count]->{index}= $message_count;
@errors[$error_count]->{subject} = $b->{parser}->{subject};
@errors[$error_count]->{shouldbe} = $bucket_true;
@errors[$error_count]->{classified_as} = $bucket_class;
$error_count++;
$correct = 0;
if ($bucket_class eq 'unclassified') {
print STDERR "/";
} else {
print STDERR "+";
}
} else {
print STDERR ".";
}
if ( retrain_decider($c->parameter('toe'),$bucket_class, $bucket_true) ) {
print STDERR "Reclassifying $message_count $bucket_class => $bucket_true.\n" if ($debug);
reclassify($b,$bucket_true);
}
@clog[$message_count] = {};
@clog[$message_count]->{count} = $message_count;
@clog[$message_count]->{correct} = $correct;
@clog[$message_count]->{error_count} = $error_count;
@clog[$message_count]->{accuracy} = int( 10000 * ( $message_count - $error_count ) / $message_count ) / 100;
@clog[$message_count]->{subject} = $b->{parser}->{subject};
@clog[$message_count]->{from} = $b->{parser}->{from};
@clog[$message_count]->{shouldbe} = $bucket_true;
@clog[$message_count]->{classified_as} = $bucket_class;
@clog[$message_count]->{filename} = @sorted_messages[$index];
foreach my $abucket (keys %{ $self->{messages}->{buckets} } ) {
if ($message_count != 1) {
if ( ( $abucket eq $bucket_class ) && ( $bucket_class ne $bucket_true ) ) {
@clog[$message_count]->{"misclassified_count_$abucket"} = $clog[($message_count - 1)]->{"misclassified_count_$abucket"} + 1;
} else {
@clog[$message_count]->{"misclassified_count_$abucket"} = $clog[($message_count - 1)]->{"misclassified_count_$abucket"};
}
} else {
@clog[$message_count]->{"misclassified_count_$abucket"} = ((($bucket_true ne $bucket_class) && ($bucket_class eq $abucket))?1:0);
}
}
my $window_size = $c->parameter("window2");
my $window_errors = 0;
foreach my $error (0 .. $#errors) {
if ( @errors[$error]->{index} > ($message_count - $window_size) ) {
$window_errors++;
}
}
@clog[$message_count]->{window_errors} = $window_errors;
$window_size = (($window_size > $message_count)?$message_count:$window_size);
@clog[$message_count]->{moving_accuracy} = int( 10000 * ( $window_size - $window_errors ) / $window_size ) / 100;
}
print STDERR "\n";
my $end_time = time;
my $total_messages = $#sorted_messages + 1;
my $accuracy = int( 10000 * ( $total_messages - $error_count ) / $total_messages ) / 100;
print STDERR "process took " . ($end_time - $start_time) . " seconds\n";
print STDERR "$error_count out of " . $total_messages . " incorrect\n";
print STDERR "that's $accuracy% accurate\n";
cvs_out($c, $c->parameter('csv'), @clog) if ($c->parameter('csv') ne "none");
} else {
print "traintest.pl - perform classification tests on archived messages\n\n";
print "Usage: traintest.pl [-usage] [-csv auto|filename] [-parameter value [-parameter value] ...]\n";
print " -usage: Displays this screen\n";
print " Other Parameters Use\n";
print " -archive_dir: Location to seek an archive\n";
print " -csv: Filename to save CSV log to, \"auto\" generates a filename\n";
print " -toe: Train Only Errors, defaults to $DEFAULT_TOE";
}
|