|
From: <ssc...@us...> - 2003-05-27 21:21:39
|
Update of /cvsroot/popfile/engine
In directory sc8-pr-cvs1:/tmp/cvs-serv12573
Added Files:
xval.pl
Log Message:
add rudimentary cross-validation script
works with archived messages from
-archive 1 -archive_classes 10
--- NEW FILE: xval.pl ---
#!/usr/bin/perl
# ---------------------------------------------------------------------------------------------
#
# traintest.pl - Simulate training on errors using a saved archive.
#
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------
use strict;
use Classifier::Bayes;
use POPFile::Configuration;
my $debug = 0;
my $DEFAULT_XCNT = '10';
my $DEFAULT_CSV = 'auto';
my $DEFAULT_STOP = 1;
my $DEFAULT_CLASSIFIER = "bayes";
my $DEFAULT_ARCHIVE = "archive";
my $DEFAULT_DUMP = "0";
my $DEFAULT_CORPUS = "archive_corpus";
$| = 1;
$, = ", ";
my %wordtab;
sub initialize
{
my ($config) = @_;
$config->parameter("xcnt",$DEFAULT_XCNT);
$config->parameter("csv",$DEFAULT_CSV);
$config->parameter("stopwords",$DEFAULT_STOP);
$config->parameter("classifier",$DEFAULT_CLASSIFIER);
$config->parameter("archive_dir",$DEFAULT_ARCHIVE);
$config->parameter("dump",$DEFAULT_DUMP);
$config->parameter("corpus_out",$DEFAULT_CORPUS);
}
sub cvs_out
{
my ($config, $file, @log) = @_;
if ($file eq 'auto') {
$file = "x_validate";
if ( $config->parameter('stopwords') ne $DEFAULT_STOP ) {
$file .= "_";
if ($config->parameter('stopwords') != 1) {
$file .= "no";
}
$file .= "stop";
}
$file .= ".csv";
}
print STDERR "Printing data to $file\n";
open CSV, ">$file";
$, = ",";
print CSV sort keys %{ @log[1] };
foreach my $message (0 .. $#log) {
$message = @log[$message];
foreach my $item (sort keys %{$message}) {
my $value = $message->{$item};
$value =~ s/\"/'/g;
print CSV "\"$value\",";
}
print CSV "\n";
}
print STDERR "Data saved to $file\n";
$, = "";
}
sub reclassify_file
{
my ($b,$bucket) = @_;
foreach my $word (keys %{ $b->{parser__}->{words__} }) {
my $wordvalue = $wordtab{$bucket."|".$word};
$wordtab{$bucket."|".$word} += $b->{parser__}->{words__}{$word};
$b->set_value_($bucket,$word, $wordtab{$bucket."|".$word});
$b->{total__}{$bucket} += $b->{parser__}->{words__}{$word};
$b->{unique__}{$bucket} += 1 if ($wordvalue == 0);
}
$b->{full_total__} += $b->{parser__}{msg_total__};
$b->update_constants_();
}
sub reclassify_folder
{
my ($b,$folder,$bucket) = @_;
my @file_array = glob "$folder/*";
foreach my $amessage (@file_array) {
}
}
sub dump_corpus
{
my ($self) = @_;
my $dir = $self->{configuration__}->parameter('corpus_out');
mkdir($dir);
foreach my $abucket ( keys %{$self->{total__}} ) {
print "saving $abucket corpus.\n";
my $subdir = $dir;
$subdir .= "/$abucket";
mkdir($subdir);
open CORPUS, ">$dir/$abucket/table";
print CORPUS "__CORPUS__ __VERSION__ 1\n";
for my $ord ( @{$self->get_bucket_word_list($abucket)} ) {
if ( defined($ord) ) {
while ($ord =~ s/\|([^ ]+) (\d+)\|//) {
print CORPUS "$1 $2\n";
}
}
}
}
}
sub find_messages
{
my ( $archive ) = @_;
my $messages = {};
my @buckets_array = glob "$archive/*";
foreach my $abucket (@buckets_array) {
$abucket =~ s/.*\/(.*)$/$1/;
print "saved bucket $abucket opening\n" if ($debug);
my @dir_messages;
my $subdirs = 1;
my $locations = 0;
#$messages->{messages}{$abucket} = {};
my @subdirectories = glob "$archive/$abucket/*";
foreach my $dir (@subdirectories) {
$dir =~ s/.*\/(.*)$/$1/;
print "$abucket subdirectory $dir opening..." if ($debug);
if ( opendir(DIR,"$archive/$abucket/$dir") ) {
print "(open):\n" if ($debug);
$messages->{buckets}{$abucket}{subdirs}{$dir} = {};
$messages->{locations}[$locations] = "$archive/$abucket/$dir";
$locations++;
$subdirs = 1;
closedir(DIR);
foreach my $message ( glob( "$archive/$abucket/$dir/*.msg" ) ) {
my $shortname = $message;
$shortname =~ s/.*\/(.*)$/$1/;
$messages->{messages}{$shortname}{bucket} = $abucket;
$messages->{messages}{$shortname}{long} = $message;
$messages->{buckets}{$abucket}{messages}{$shortname} = $abucket;
$messages->{buckets}{$abucket}{subdirs}{$dir}{$shortname} = $abucket;
}
} else {
print ":" if ($debug);
$subdirs = 0;
last;
}
}
if ($subdirs == 0) {
print "$abucket root dir opening messages:\n" if $debug;
foreach my $message ( glob( "$archive/$abucket/*.msg") ) {
my $shortname = $message;
$shortname =~ s/.*\/(.*)$/$1/;
$messages->{messages}{$shortname}{bucket} = $abucket;
$messages->{messages}{$shortname}{long} = $message;
$messages->{buckets}{$abucket}{messages}{$shortname} = $abucket;
}
$messages->{locations}[$locations] = "$archive/$abucket";
$locations++;
}
}
return $messages;
}
sub flush_bayes
{
my ($self) = @_;
# Set this to 1 to get scores for individual words in message detail
$self->{wordscores__} = 0;
# Just our hostname
$self->{hostname__} = '';
# Matrix of buckets, words and the word counts
$self->{matrix__} = {};
# Total number of words in each bucket
$self->{total__} = {};
# Total number of unique words in each bucket
$self->{unique__} = {};
# Total number of words in all buckets
$self->{full_total__} = 0;
# Used to mangle the corpus when loaded
$self->{mangler__} = new Classifier::WordMangle;
# Used to parse mail messages
$self->{parser__} = new Classifier::MailParse;
# Colors assigned to each bucket
$self->{colors__} = {};
# The possible colors for buckets
$self->{possible_colors__} = [ 'red', 'green', 'blue', 'brown', 'orange', 'purple', 'magenta', 'gray', 'plum', 'silver',
'pink', 'lightgreen', 'lightblue', 'lightcyan', 'lightcoral', 'lightsalmon', 'lightgrey', 'darkorange', 'darkcyan', 'feldspar' ];
# Precomputed per bucket probabilities
$self->{bucket_start__} = {};
# A very unlikely word
$self->{not_likely__} = 0;
# The expected corpus version
$self->{corpus_version__} = 1;
# Per bucket parameters
$self->{parameters__} = {};
# The magnets that cause attraction to certain buckets
$self->{magnets__} = {};
$self->{magnet_count__} = 0;
# The unclassified cutoff probability
$self->{unclassified__} = 0.5;
# Used to tell the caller whether a magnet was used in the last
# mail classification
$self->{magnet_used__} = 0;
$self->{magnet_detail__} = '';
}
my $self = {};
# main
if ( @ARGV[0] ne "-usage")
{
my $b = new Classifier::Bayes;
my $c = new POPFile::Configuration;
$b->configuration( $c );
$c->configuration( $c );
$c->initialize();
$b->initialize();
initialize( $c );
$c->load_configuration();
$c->parse_command_line();
$b->{debug} = 0;
$b->{parser__}->{debug} = 0;
my $archive = $c->parameter("html_archive_dir");
#load the messages
$self->{messages} = find_messages($archive);
foreach my $exclude_fraction (0 .. ( $c->parameter("xcnt") - 1) ) {
print "excluding grouping $exclude_fraction\n" if $debug;
# Clean up our bayes object
flush_bayes($b);
$b->{unclassified__} = log($c->parameter("bayes_unclassified_probability") || 0.5);
# test with or without stop-words
if ( $c->parameter("stopwords") eq 0 ) {
$b->{parser__}->{mangle__}->{stop__} = {};
$b->{mangler__}->{stop__} = {};
}
# build a bucket skeleton
foreach my $abucket ( keys %{ $self->{messages}->{buckets} } ) {
$b->{total__}{$abucket} = 0;
$b->{colors__}{$abucket} = "black";
if ($abucket ne 'unclassified') {
foreach my $current_fraction ( sort keys( %{ $self->{messages}->{buckets}{$abucket}{subdirs}} )) {
# Do the hippy-hippy corpus-building shake
if ($current_fraction ne $exclude_fraction) {
print "training on $abucket/$current_fraction\n" if ($debug);
#my $dir = $self->{messages}->{messages}{each( $self->{messages}->{buckets}{$abucket}{subdirs}{$current_fraction}) }{long};
#$dir =~ s/(.*)\/[^/]*$/$1/;
#print "$dir:\n";
#reclassify_folder($b,$dir,$abucket);
foreach my $amessage ( keys %{$self->{messages}->{buckets}{$abucket}{subdirs}{$current_fraction}} ) {
$b->{parser__}->parse_stream($self->{messages}->{messages}{$amessage}{long});
reclassify_file($b,$abucket);
}
}
}
}
}
# classify all messages in the selected bunch
my $class;
my $total = 0;
my $errors = 0;
foreach my $abucket ( keys %{ $self->{messages}->{buckets} } ) {
print "classifying messages in $abucket/$exclude_fraction\n" if $debug;
foreach my $amessage ( keys %{$self->{messages}->{buckets}{$abucket}{subdirs}{$exclude_fraction}} ) {
$class = $b->classify_file($self->{messages}->{messages}{$amessage}{long});
if ($class ne $abucket) {
$errors++;
print "$class ne $abucket\n" if $debug;
}
$total++;
}
}
my $accuracy = (($total - $errors) / $total) * 100;
print "batch $exclude_fraction: $errors out of $total wrong. $accuracy% accurate\n";
$exclude_fraction++;
}
} else {
print "xval.pl - perform cross-validation tests on archived messages\n\n";
print "Usage: traintest.pl [-usage] [-parameter value [-parameter value] ...]\n";
print " -usage: Displays this screen\n";
print " Other Parameters Use\n";
print " -archive_dir: Location to seek an archive\n";
print " -stopwords: Use stop-words, defaults to $DEFAULT_STOP\n";
print " -dump: Outputs accumulated corpus, defaults to $DEFAULT_DUMP\n";
print " -corpus_out: Location to save output corpus, defaults to $DEFAULT_CORPUS\n";
}
|