Thread: [popfile-commit] engine/Classifier Bayes.pm,1.189,1.190

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv27765/Classifier

Modified Files:
	Bayes.pm 
Log Message:
PORT TO STORE CORPUS IN BERKELEYDB DATABASES

Bayes.pm:

The $self->{matrix__} hash is now a collection of tied hashes
to BerkeleyDB databases named table.db in each of the corpus
bucket subdirectories.  The set_value_ and get_value_ accessors
have been modified to access the database.  load_word_matrix_
and load_bucket_ now load the bucket information from the database
in concurrent mode.

prefork, forked and postfork handle closing and reopening the database
connections around forks to ensure that there are no threading
problems with the database.

close_database__ can be called to clean up the connection to the
database at any time.

Many API functions have been modified internally to use the new
structure.  The external APIs have not changed.  get_bucket_word_list
is currently not implemented.

load_bucket_ does automatic upgrade from the old flat file style
of corpus to the database.

HTML.pm:

Added a note that since get_bucket_word_list isn't working it is
not possible to view the words in a bucket.

Module.pm:

Added description and base implementation of the new postfork()
method that is called on all modules in the parent process after
a fork has occurred.  This is the parent equivalent of forked().

Loader.pm:

The forker is modified to call postfork() in the parent process
after a successful fork.

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.189
retrieving revision 1.190
diff -C2 -d -r1.189 -r1.190
*** Bayes.pm	9 Sep 2003 00:28:19 -0000	1.189
--- Bayes.pm	10 Sep 2003 03:54:14 -0000	1.190
***************
*** 37,45 ****
--- 37,53 ----
  # This is used to get the hostname of the current machine
  # in a cross platform way
+ 
  use Sys::Hostname;

  # A handy variable containing the value of an EOL for networks
+ 
  my $eol = "\015\012";

+ # The corpus is stored in BerkeleyDB hashes called table.db in each
+ # of the corpus/* subdirectories.  The db files are tied to Perl
+ # hashes for simple access
+ 
+ use BerkeleyDB;
+ 
  #----------------------------------------------------------------------------
  # new
***************
*** 60,69 ****
      # Matrix of buckets, words and the word counts
      $self->{matrix__}            = {};
! 
!     # Total number of words in each bucket
!     $self->{total__}             = {};
! 
!     # Total number of unique words in each bucket
!     $self->{unique__}            = {};

      # Total number of words in all buckets
--- 68,72 ----
      # Matrix of buckets, words and the word counts
      $self->{matrix__}            = {};
!     $self->{db_env__}            = 0;

      # Total number of words in all buckets
***************
*** 122,125 ****
--- 125,175 ----
  # ---------------------------------------------------------------------------------------------
  #
+ # prefork
+ #
+ # POPFile is about to fork.  Because the BerkeleyDB interface doesn't support multiple
+ # threads accessing the database, we will get a nasty failure if the database is tied to
+ # the hashes when the fork occurs (actually when the child exits).  So here we untie from
+ # the database.
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub prefork
+ {
+     my ( $self ) = @_;
+ 
+     $self->close_database__();
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # forked
+ #
+ # This is called inside a child process that has just forked.  Since the child needs
+ # access to the database, we reopen it.
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub forked
+ {
+     my ( $self ) = @_;
+ 
+     $self->load_word_matrix_();
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
+ # postfork
+ #
+ # This is called inside the parent process that has just forked.  Since the parent needs
+ # access to the database, we reopen it.
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub postfork
+ {
+     my ( $self ) = @_;
+ 
+     $self->load_word_matrix_();
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
  # initialize
  #
***************
*** 190,193 ****
--- 240,244 ----

      $self->write_parameters();
+     $self->close_database__();
  }

***************
*** 222,226 ****
      my ($self) = @_;

!     for my $bucket (keys %{$self->{total__}})  {
          open PARAMS, '>' . $self->config_( 'corpus' ) . "/$bucket/params";
          for my $param (keys %{$self->{parameters__}{$bucket}}) {
--- 273,277 ----
      my ($self) = @_;

!     for my $bucket (keys %{$self->{matrix__}})  {
          open PARAMS, '>' . $self->config_( 'corpus' ) . "/$bucket/params";
          for my $param (keys %{$self->{parameters__}{$bucket}}) {
***************
*** 233,236 ****
--- 284,308 ----
  # ---------------------------------------------------------------------------------------------
  #
+ # close_database__
+ #
+ # Close all the database connections
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub close_database__
+ {
+     my ( $self ) = @_;
+ 
+     for my $bucket (keys %{$self->{matrix__}})  {
+         untie %{$self->{matrix__}{$bucket}};
+         delete $self->{matrix__}{$bucket};
+     }
+ 
+     if ( defined( $self->{db_env__} ) ) {
+         delete $self->{db_env__};
+     }
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
  # get_color
  #
***************
*** 247,252 ****
      my $color = 'black';

!     for my $bucket (keys %{$self->{total__}}) {
!         my $prob = get_value_( $self, $bucket, $word);

          if ( $prob != 0 )  {
--- 319,324 ----
      my $color = 'black';

!     for my $bucket (keys %{$self->{matrix__}}) {
!         my $prob = get_value_( $self, $bucket, $word );

          if ( $prob != 0 )  {
***************
*** 263,301 ****
  # ---------------------------------------------------------------------------------------------
  #
! # Perl hashes are a memory hog.  The original implementation was a Perl hash for the word
! # matrix, but instead we use a a set of nested array and some regexps magic.
! #
! # The word paradise in the bucket spam will be found in the array element
! #   matrix{spam}[p] with an entry of the form "|paradise 1234|".
  #
! # TODO: replace the word matrix hash with Berkeley DB tie
  #
  # ---------------------------------------------------------------------------------------------
  sub get_value_
  {
!     my ($self, $bucket, $word) = @_;
!     $word =~ /^(.)/;
!     my $i = ord($1);

!     if ( defined($self->{matrix__}{$bucket}[$i]) ) {
!         if ( ( $self->{matrix__}{$bucket}[$i] =~ /\|\Q$word\E (\d+)\|/ ) != 0 )  {
!             my $newvalue = log($1/$self->{total__}{$bucket});
!             return $newvalue;
!         }
      }

!     return 0;
  }

  sub set_value_
  {
!     my ($self, $bucket, $word, $value) = @_;

!     if ( $word ne '' ) {
!         $word =~ /^(.)/;
!         my $i = ord($1);

!         $self->{matrix__}{$bucket}[$i] = '' if ( !defined($self->{matrix__}{$bucket}[$i]) );
!         $self->{matrix__}{$bucket}[$i] .= "|$word $value|" if ( ( $self->{matrix__}{$bucket}[$i] =~ s/\|\Q$word\E [\-\.\d]+\|/\|$word $value\|/ ) == 0 );
      }
  }
--- 335,413 ----
  # ---------------------------------------------------------------------------------------------
  #
! # get_value_
  #
! # Returns the value for a specific word in a bucket.  The word's count is converted to the
! # log of its probability before being returned; to get the raw value, just hit the hash
! # directly or call get_base_value_
  #
  # ---------------------------------------------------------------------------------------------
  sub get_value_
  {
!     my ( $self, $bucket, $word ) = @_;

!     my $value = $self->{matrix__}{$bucket}{$word};
! 
!     if ( defined( $value ) ) {
!         my $total = $self->get_bucket_word_count( $bucket );
!         return log( $value ) - log( $total );
!     } else {
!         return 0;
      }
+ }

! sub get_base_value_
! {
!     my ( $self, $bucket, $word ) = @_;
! 
!     my $value = $self->{matrix__}{$bucket}{$word};
! 
!     if ( defined( $value ) ) {
!         return $value;
!     } else {
!         return 0;
!     }
  }

+ # ---------------------------------------------------------------------------------------------
+ #
+ # set_value_
+ #
+ # Sets the value for a word in a bucket and updates the total word counts for the bucket
+ # and globally
+ #
+ # ---------------------------------------------------------------------------------------------
  sub set_value_
  {
!     my ( $self, $bucket, $word, $value ) = @_;

!     # If there's an existing value then remove it and keep the total up to date
!     # then add the new value, this is a little complicated but by keeping the
!     # total in a value in the database it avoids us doing any sort of query
!     # or full table scan

!     my $oldvalue = $self->{matrix__}{$bucket}{$word};
! 
!     if ( !defined( $oldvalue ) ) {
!         $oldvalue = 0;
!         $self->{matrix__}{$bucket}{$word} = $oldvalue;
!         if ( defined( $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} ) ) {
!             $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} += 1;
! 	} else {
!             $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} = 1;
!         }
!     }
! 
!     my $total = $self->get_bucket_word_count( $bucket );
! 
!     $total                                         -= $oldvalue;
!     $self->{full_total__}                          -= $oldvalue;
!     $self->{matrix__}{$bucket}{$word}               = $value;
!     $total                                         += $value;
!     $self->{matrix__}{$bucket}{__POPFILE__TOTAL__}  = $total;
!     $self->{full_total__}                          += $value;
! 
!     if ( $self->{matrix__}{$bucket}{$word} <= 0 ) {
!         $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__} -= 1;
!         delete $self->{matrix__}{$bucket}{$word};
      }
  }
***************
*** 333,344 ****

      if ( $self->{full_total__} > 0 )  {

!         # ln(10) =~ 2.30258509299404568401799145468436
! 
!         $self->{not_likely__} = -log( $self->{full_total__} ) - 2.30258509299404568401799145468436;

!         foreach my $bucket (keys %{$self->{total__}}) {
!             if ( $self->{total__}{$bucket} != 0 ) {
!                 $self->{bucket_start__}{$bucket} = log( $self->{total__}{$bucket} / $self->{full_total__} );
              } else {
                  $self->{bucket_start__}{$bucket} = 0;
--- 445,455 ----

      if ( $self->{full_total__} > 0 )  {
+         $self->{not_likely__} = -log( $self->{full_total__} ) - log(10);

!         foreach my $bucket (keys %{$self->{matrix__}}) {
!             my $total = $self->get_bucket_word_count( $bucket );

!             if ( $total != 0 ) {
!                 $self->{bucket_start__}{$bucket} = log( $total ) - log( $self->{full_total__} );
              } else {
                  $self->{bucket_start__}{$bucket} = 0;
***************
*** 360,365 ****
      my $c      = 0;

!     $self->{matrix__}       = {};
!     $self->{total__}        = {};
      $self->{magnets__}      = {};
      $self->{full_total__}   = 0;
--- 471,477 ----
      my $c      = 0;

!     $self->close_database__();
!     $self->{db_env__} = new BerkeleyDB::Env -Flags => DB_INIT_CDB;
! 
      $self->{magnets__}      = {};
      $self->{full_total__}   = 0;
***************
*** 393,399 ****

          $self->load_bucket_( $bucket );
          $bucket =~ /([[:alpha:]0-9-_]+)$/;
          $bucket =  $1;
-         $self->{full_total__} += $self->{total__}{$bucket};

          if ( $color eq '' )  {
--- 505,511 ----

          $self->load_bucket_( $bucket );
+ 
          $bucket =~ /([[:alpha:]0-9-_]+)$/;
          $bucket =  $1;

          if ( $color eq '' )  {
***************
*** 445,451 ****
      $self->{parameters__}{$bucket}{quarantine} = 0;

-     $self->{total__}{$bucket}  = 0;
-     $self->{unique__}{$bucket} = 0;
-     $self->{matrix__}{$bucket} = ();
      $self->{magnets__}{$bucket} = {};

--- 557,560 ----
***************
*** 502,541 ****
      }

!     # Each line in the word table is a word and a count
!     $self->{total__}{$bucket} = 0;

!     if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" )  {
!         my $first = <WORDS>;
!         if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) {
!             if ( $1 != $self->{corpus_version__} )  {
!                 print STDERR "Incompatible corpus version in $bucket\n";
!                 close WORDS;
!                 return 0;
!             }
!         } else {
!             close WORDS;
!             return 0;
!         }

!         while ( <WORDS> ) {

!             s/[\r\n]//g;

!             if ( /^([^\s]+) (\d+)$/ ) {
!                 my $word  = $1;
!                 my $value = $2;
!                 if ( $value > 0 )  {
!                     $self->{total__}{$bucket}        += $value;
!                     $self->{unique__}{$bucket}       += 1;
!                     set_value_( $self, $bucket, $word, $value );
                  }
              } else {
!                 $self->log_( "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" );
!             }
!         }

!         close WORDS;
      }

      $self->calculate_magnet_count__();

--- 611,678 ----
      }

!     # This code performs two tasks:
!     #
!     # If there is an existing table.db in the bucket directory then simply
!     # tie it to the appropriate hash.
!     #
!     # If there is no existing table but there is a table file (the old style
!     # flat file used by POPFile for corpus storage) then create the new
!     # tied hash from it thus performing an automatic upgrade.

!     tie %{$self->{matrix__}{$bucket}}, "BerkeleyDB::Hash",
!             -Filename => $self->config_( 'corpus' ) . "/$bucket/table.db",
!             -Flags    => DB_CREATE;

!     if ( !defined( $self->get_bucket_word_count( $bucket ) ) ) {
!         $self->{matrix__}{$bucket}{__POPFILE__TOTAL__} = 0;
!     }

!     if ( -e $self->config_( 'corpus' ) . "/$bucket/table" ) {
!         $self->log_( "Performing automatic upgrade of $bucket corpus from flat file to BerkeleyDB" );

!         my $ft = $self->{full_total__};
! 
!         if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" )  {
! 
! 	    print "\nUpgrading bucket $bucket...";
!             flush STDOUT;
!             my $wc = 1;
! 
!             my $first = <WORDS>;
!             if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) {
!                 if ( $1 != $self->{corpus_version__} )  {
!                     print STDERR "Incompatible corpus version in $bucket\n";
!                     close WORDS;
!                 } else {
!                     while ( <WORDS> ) {
! 		        if ( $wc % 100 == 0 ) {
!                             print "$wc ";
!                             flush STDOUT;
! 		        }
!                         $wc += 1;
!                         s/[\r\n]//g;
! 
!                         if ( /^([^\s]+) (\d+)$/ ) {
!                             $self->set_value_( $bucket, $1, $2 );
!                         } else {
!                             $self->log_( "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" );
!                         }
! 		    }
                  }
+ 
+                 print "(completed $wc words)";
+                 close WORDS;
              } else {
!                 close WORDS;
! 	    }

!             unlink( $self->config_( 'corpus' ) . "/$bucket/table" );
! 
!             $self->{full_total__} = $ft;
!         }
      }

+     $self->{full_total__} += $self->get_bucket_word_count( $bucket );
+ 
      $self->calculate_magnet_count__();

***************
*** 557,561 ****
      $self->{magnet_count__} = 0;

!     for my $bucket (keys %{$self->{total__}}) {
          for my $type (keys %{$self->{magnets__}{$bucket}})  {
              for my $from (keys %{$self->{magnets__}{$bucket}{$type}})  {
--- 694,698 ----
      $self->{magnet_count__} = 0;

!     for my $bucket (keys %{$self->{matrix__}}) {
          for my $type (keys %{$self->{magnets__}{$bucket}})  {
              for my $from (keys %{$self->{magnets__}{$bucket}{$type}})  {
***************
*** 577,581 ****
      my ($self) = @_;

!     for my $bucket (keys %{$self->{total__}}) {
          open MAGNET, '>' . $self->config_( 'corpus' ). "/$bucket/magnets";

--- 714,718 ----
      my ($self) = @_;

!     for my $bucket (keys %{$self->{matrix__}}) {
          open MAGNET, '>' . $self->config_( 'corpus' ). "/$bucket/magnets";

***************
*** 602,606 ****
  #
  # ---------------------------------------------------------------------------------------------
- 
  sub chi2
  {
--- 739,742 ----
***************
*** 646,650 ****
      # Get the list of buckets

!     my @buckets = keys %{$self->{total__}};

      for my $bucket (sort keys %{$self->{magnets__}})  {
--- 782,786 ----
      # Get the list of buckets

!     my @buckets = keys %{$self->{matrix__}};

      for my $bucket (sort keys %{$self->{magnets__}})  {
***************
*** 707,711 ****

          foreach my $bucket (@buckets) {
!             my $probability = get_value_( $self, $bucket, $word );

              $matchcount{$bucket} += $self->{parser__}{words__}{$word} if ($probability != 0);
--- 843,847 ----

          foreach my $bucket (@buckets) {
!             my $probability = $self->get_value_( $bucket, $word );

              $matchcount{$bucket} += $self->{parser__}{words__}{$word} if ($probability != 0);
***************
*** 841,845 ****
                  foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
                      my $bucket = $ranking[$ix];
!                     my $probability  = get_value_( $self, $bucket, $word );
                      my $color        = 'black';

--- 977,981 ----
                  foreach my $ix (0..($#buckets > 7? 7: $#buckets)) {
                      my $bucket = $ranking[$ix];
!                     my $probability  = $self->get_value_( $bucket, $word );
                      my $color        = 'black';

***************
*** 1284,1288 ****
      my ( $self ) = @_;

!     return sort keys %{$self->{total__}};
  }

--- 1420,1424 ----
      my ( $self ) = @_;

!     return sort keys %{$self->{matrix__}};
  }

***************
*** 1300,1304 ****
      my ( $self, $bucket ) = @_;

!     return $self->{total__}{$bucket};
  }

--- 1436,1440 ----
      my ( $self, $bucket ) = @_;

!     return $self->{matrix__}{$bucket}{__POPFILE__TOTAL__};
  }

***************
*** 1320,1330 ****

      if ( $self->get_bucket_word_count( $bucket ) > 0 ) {
!         my @entries = @{$self->{matrix__}{$bucket}};

!         for my $i (0..$#entries) {
!             if ( defined( $entries[$i] ) && ( $entries[$i] ne '' ) ) {
!                 push @result, ($entries[$i]);
!             }
!         }
      }

--- 1456,1466 ----

      if ( $self->get_bucket_word_count( $bucket ) > 0 ) {
! # TODO        my @entries = @{$self->{matrix__}{$bucket}};

! # TODO        for my $i (0..$#entries) {
! # TODO            if ( defined( $entries[$i] ) && ( $entries[$i] ne '' ) ) {
! # TODO                push @result, ($entries[$i]);
! # TODO            }
! # TODO        }
      }

***************
*** 1360,1370 ****
      my ( $self, $bucket, $word ) = @_;

!     my $value = $self->get_value_( $bucket, $word );
! 
!     if ( $value == 0 ) {
!          return 0;
!     } else {
!         return int( exp( $value ) * $self->get_bucket_word_count( $bucket ) + 0.5 );
!     }
  }

--- 1496,1500 ----
      my ( $self, $bucket, $word ) = @_;

!     return $self->get_base_value_( $bucket, $word );
  }

***************
*** 1382,1386 ****
      my ( $self, $bucket ) = @_;

!     return $self->{unique__}{$bucket};
  }

--- 1512,1516 ----
      my ( $self, $bucket ) = @_;

!     return $self->{matrix__}{$bucket}{__POPFILE__UNIQUE__};
  }

***************
*** 1498,1505 ****
      mkdir( $self->config_( 'corpus' ) . "/$bucket" );

!     if ( open NEW, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) {
!         print NEW "__CORPUS__ __VERSION__ 1\n";
!         close NEW;
!     }

      $self->load_word_matrix_();
--- 1628,1634 ----
      mkdir( $self->config_( 'corpus' ) . "/$bucket" );

!     tie %{$self->{matrix__}{$bucket}}, "BerkeleyDB::Hash",
!             -Filename => $self->config_( 'corpus' ) . "/$bucket/table.db",
!             -Flags    => DB_CREATE;

      $self->load_word_matrix_();
***************
*** 1519,1523 ****
      my ( $self, $bucket ) = @_;

!     if ( !defined( $self->{total__}{$bucket} ) ) {
          return 0;
      }
--- 1648,1652 ----
      my ( $self, $bucket ) = @_;

!     if ( !defined( $self->{matrix__}{$bucket} ) ) {
          return 0;
      }
***************
*** 1525,1529 ****
      my $bucket_directory = $self->config_( 'corpus' ) . "/$bucket";

!     unlink( "$bucket_directory/table" );
      unlink( "$bucket_directory/color" );
      unlink( "$bucket_directory/params" );
--- 1654,1660 ----
      my $bucket_directory = $self->config_( 'corpus' ) . "/$bucket";

!     $self->close_database__();
! 
!     unlink( "$bucket_directory/table.db" );
      unlink( "$bucket_directory/color" );
      unlink( "$bucket_directory/params" );
***************
*** 1550,1557 ****
      my ( $self, $old_bucket, $new_bucket ) = @_;

!     if ( !defined( $self->{total__}{$old_bucket} ) ) {
          return 0;
      }

      rename($self->config_( 'corpus' ) . "/$old_bucket" , $self->config_( 'corpus' ) . "/$new_bucket");

--- 1681,1690 ----
      my ( $self, $old_bucket, $new_bucket ) = @_;

!     if ( !defined( $self->{matrix__}{$old_bucket} ) ) {
          return 0;
      }

+     $self->close_database__();
+ 
      rename($self->config_( 'corpus' ) . "/$old_bucket" , $self->config_( 'corpus' ) . "/$new_bucket");

***************
*** 1578,1629 ****
      # when making a new bucket.

!     if ( !defined( $self->{total__}{$bucket} ) ) {
          return 0;
      }

-     my %words;
- 
-     if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" )  {
-         while (<WORDS>) {
-             if ( /__CORPUS__ __VERSION__ (\d+)/ ) {
-                 if ( $1 != $self->{corpus_version__} )  {
-                     print STDERR "Incompatible corpus version in $bucket\n";
-                     close WORDS;
-                     return 0;
-                 }
- 
-                 next;
-             }
- 
-             s/[\r\n]//g;
- 
-             if ( /^([^\s]+) (\d+)$/ ) {
-                 my $word  = $1;
-                 my $value = $2;
-                 if ( $value > 0 )  {
-                     $words{$word} = $value;
-                 }
-             }
-         }
- 
-         close WORDS;
-     }
- 
      foreach my $file (@files) {
          $self->{parser__}->parse_file( $file );

          foreach my $word (keys %{$self->{parser__}->{words__}}) {
!             $words{$word} += $self->{parser__}->{words__}{$word};
!         }
!     }
! 
!     if ( open WORDS, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) {
!         print WORDS "__CORPUS__ __VERSION__ 1\n";
!         foreach my $word (sort keys %words) {
!             if ( $words{$word} != 0 ) {
!                 print WORDS "$word $words{$word}\n";
!             }
          }
-         close WORDS;
      }

--- 1711,1724 ----
      # when making a new bucket.

!     if ( !defined( $self->{matrix__}{$bucket} ) ) {
          return 0;
      }

      foreach my $file (@files) {
          $self->{parser__}->parse_file( $file );

          foreach my $word (keys %{$self->{parser__}->{words__}}) {
!             $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
          }
      }

***************
*** 1667,1716 ****
      # when making a new bucket.

!     if ( !defined( $self->{total__}{$bucket} ) ) {
          return 0;
      }

-     my %words;
- 
-     if ( open WORDS, '<' . $self->config_( 'corpus' ) . "/$bucket/table" )  {
-         while (<WORDS>) {
-             if ( /__CORPUS__ __VERSION__ (\d+)/ ) {
-                 if ( $1 != $self->{corpus_version__} )  {
-                     print STDERR "Incompatible corpus version in $bucket\n";
-                     close WORDS;
-                     return 0;
-                 }
- 
-                 next;
-             }
- 
-             s/[\r\n]//g;
- 
-             if ( /^([^\s]+) (\d+)$/ ) {
-                 my $word  = $1;
-                 my $value = $2;
-                 if ( $value > 0 )  {
-                     $words{$word} = $value;
-                 }
-             }
-         }
- 
-         close WORDS;
-     }
- 
      $self->{parser__}->parse_file( $file );

      foreach my $word (keys %{$self->{parser__}->{words__}}) {
!         $words{$word} -= $self->{parser__}->{words__}{$word};
!     }
! 
!     if ( open WORDS, '>' . $self->config_( 'corpus' ) . "/$bucket/table" ) {
!         print WORDS "__CORPUS__ __VERSION__ 1\n";
!         foreach my $word (sort keys %words) {
!             if ( $words{$word} != 0 ) {
!                 print WORDS "$word $words{$word}\n";
!             }
!         }
!         close WORDS;
      }

--- 1762,1773 ----
      # when making a new bucket.

!     if ( !defined( $self->{matrix__}{$bucket} ) ) {
          return 0;
      }

      $self->{parser__}->parse_file( $file );

      foreach my $word (keys %{$self->{parser__}->{words__}}) {
!         $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
      }

***************
*** 1827,1831 ****
      my $bucket_directory = $self->config_( 'corpus' ) . "/$bucket";

!     unlink( "$bucket_directory/table" );

      $self->load_word_matrix_();
--- 1884,1889 ----
      my $bucket_directory = $self->config_( 'corpus' ) . "/$bucket";

!     untie %{$self->{matrix__}{$bucket}};
!     unlink( "$bucket_directory/table.db" );

      $self->load_word_matrix_();