Thread: [popfile-commit] engine/Classifier Bayes.pm,1.199,1.200 MailParse.pm,1.161,1.162

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv21997/Classifier

Modified Files:
	Bayes.pm MailParse.pm 
Log Message:
Merge patch that makes POPFile work well with the Japanese language

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.199
retrieving revision 1.200
diff -C2 -d -r1.199 -r1.200
*** Bayes.pm	22 Sep 2003 13:06:39 -0000	1.199
--- Bayes.pm	22 Sep 2003 13:27:21 -0000	1.200
***************
*** 783,791 ****
      my $msg_total = 0;

      $self->{magnet_used__}   = 0;
      $self->{magnet_detail__} = '';

      if ( defined( $file ) ) {
!         $self->{parser__}->parse_file( $file );
      }

--- 783,793 ----
      my $msg_total = 0;

+     # Pass language parameter to parse_file()
+ 
      $self->{magnet_used__}   = 0;
      $self->{magnet_detail__} = '';

      if ( defined( $file ) ) {
!         $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
      }

***************
*** 809,825 ****
              $noattype =~ s/\$/__POPFILE_DOLLAR__/g;

!             for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
!                 my $regex;

!                 $regex = $magnet;
!                 $regex =~ s/@/__POPFILE_AT__/g;
!                 $regex =~ s/\$/__POPFILE_DOLLAR__/g;

!                 if ( $noattype =~ m/\Q$regex\E/i ) {
!                     $self->{scores__}        = '';
!                     $self->{magnet_used__}   = 1;
!                     $self->{magnet_detail__} = "$type: $magnet";

!                     return $bucket;
                  }
              }
--- 811,850 ----
              $noattype =~ s/\$/__POPFILE_DOLLAR__/g;

!             # In Japanese mode, disable locale.
!             # Sorting Japanese with "use locale" is memory and time consuming,
!             # and may cause perl crash.

!             if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
!                 no locale;
!                 for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
!                     my $regex;

!                     $regex = $magnet;
!                     $regex =~ s/@/__POPFILE_AT__/g;
!                     $regex =~ s/\$/__POPFILE_DOLLAR__/g;

!                     if ( $noattype =~ m/\Q$regex\E/i ) {
!                         $self->{scores__}        = '';
!                         $self->{magnet_used__}   = 1;
!                         $self->{magnet_detail__} = "$type: $magnet";
! 
!                         return $bucket;
!                     }
!                 }
!             } else {
!                 for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
!                     my $regex;
! 
!                     $regex = $magnet;
!                     $regex =~ s/@/__POPFILE_AT__/g;
!                     $regex =~ s/\$/__POPFILE_DOLLAR__/g;
! 
!                     if ( $noattype =~ m/\Q$regex\E/i ) {
!                         $self->{scores__}        = '';
!                         $self->{magnet_used__}   = 1;
!                         $self->{magnet_detail__} = "$type: $magnet";
! 
!                         return $bucket;
!                     }
                  }
              }
***************
*** 855,859 ****
      # the values displayed in scores__; it has no effect on the classification
      # process.
!   
      my $correction = 0;

--- 880,884 ----
      # the values displayed in scores__; it has no effect on the classification
      # process.
! 
      my $correction = 0;

***************
*** 874,878 ****
              $score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
          }
!   
          if ($wmax > $self->{not_likely__}) {
              $correction += $self->{not_likely__} * $self->{parser__}{words__}{$word};
--- 899,903 ----
              $score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
          }
! 
          if ($wmax > $self->{not_likely__}) {
              $correction += $self->{not_likely__} * $self->{parser__}{words__}{$word};
***************
*** 915,922 ****
      }

- 
      $class = 'unsure' if ( $certainty < 0.4 );

- 
      # Compute the total of all the scores to generate the normalized scores and probability
      # estimate.  $total is always 1 after the first loop iteration, so any additional term
--- 940,945 ----
***************
*** 976,980 ****

          $self->{scores__} .= "<a name=\"scores\">";
!         
          # If there are fewer than 2 buckets, there is no "verdict " to mention.
          if (@buckets > 1) {
--- 999,1003 ----

          $self->{scores__} .= "<a name=\"scores\">";
! 
          # If there are fewer than 2 buckets, there is no "verdict " to mention.
          if (@buckets > 1) {
***************
*** 1018,1026 ****

          $self->{scores__} .= "</table><hr>";
!  
          # We want a link to change the format here.  But only the UI knows how to build
          # that link.  So we just insert a comment which can be replaced by the UI.  There's
          # probably a better way.
!    
          $self->{scores__} .= "<!--format--><p>";
          $self->{scores__} .= "<table class=\"top20Words\">\n";
--- 1041,1049 ----

          $self->{scores__} .= "</table><hr>";
! 
          # We want a link to change the format here.  But only the UI knows how to build
          # that link.  So we just insert a comment which can be replaced by the UI.  There's
          # probably a better way.
! 
          $self->{scores__} .= "<!--format--><p>";
          $self->{scores__} .= "<table class=\"top20Words\">\n";
***************
*** 1036,1043 ****

          my %wordprobs;
!   
          # If the word matrix is supposed to show probabilities, compute them,
          # saving the results in %wordprobs.
!   
          if ( $self->{wmformat__} eq 'prob') {
              foreach my $word (keys %{$self->{parser__}->{words__}}) {
--- 1059,1066 ----

          my %wordprobs;
! 
          # If the word matrix is supposed to show probabilities, compute them,
          # saving the results in %wordprobs.
! 
          if ( $self->{wmformat__} eq 'prob') {
              foreach my $word (keys %{$self->{parser__}->{words__}}) {
***************
*** 1053,1057 ****
              }
          }
!    
          my @ranked_words;
          if ($self->{wmformat__} eq 'prob') {
--- 1076,1080 ----
              }
          }
! 
          my @ranked_words;
          if ($self->{wmformat__} eq 'prob') {
***************
*** 1400,1403 ****
--- 1423,1432 ----
      close TEMP;

+     # Parse Japanese mail message with Kakasi
+ 
+     if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
+         parse_with_kakasi( $self, $temp_file, $dcount, $mcount );
+     }
+ 
      # If we don't yet know the classification then stop the parser
      if ( $class eq '' ) {
***************
*** 1726,1730 ****
      $self->{parser__}->{color__} = 1;
      $self->{parser__}->{bayes__} = bless $self;
!     my $result = $self->{parser__}->parse_file( $file );
      $self->{parser__}->{color__} = 0;

--- 1755,1763 ----
      $self->{parser__}->{color__} = 1;
      $self->{parser__}->{bayes__} = bless $self;
! 
!     # Pass language parameter to parse_file()
! 
!     my $result = $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
! 
      $self->{parser__}->{color__} = 0;

***************
*** 1861,1869 ****
      }

      foreach my $file (@files) {
!         $self->{parser__}->parse_file( $file );

!         foreach my $word (keys %{$self->{parser__}->{words__}}) {
!             $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
          }
      }
--- 1894,1915 ----
      }

+     # Pass language parameter to parse_file()
+ 
      foreach my $file (@files) {
!         $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );

!        # In Japanese mode, disable locale.
!        # Sorting Japanese with "use locale" is memory and time consuming,
!        # and may cause perl crash.
! 
!        if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
!             no locale;
!             foreach my $word (keys %{$self->{parser__}->{words__}}) {
!                 $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
!             }
!         } else {
!             foreach my $word (keys %{$self->{parser__}->{words__}}) {
!                 $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
!             }
          }
      }
***************
*** 1912,1919 ****
      }

!     $self->{parser__}->parse_file( $file );

!     foreach my $word (keys %{$self->{parser__}->{words__}}) {
!         $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
      }

--- 1958,1978 ----
      }

!     # Pass language parameter to parse_file()

!     $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
! 
!     # In Japanese mode, disable locale.
!     # Sorting Japanese with "use locale" is memory and time consuming,
!     # and may cause perl crash.
! 
!     if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
!         no locale;
!         foreach my $word (keys %{$self->{parser__}->{words__}}) {
!             $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
!         }
!     } else {
!         foreach my $word (keys %{$self->{parser__}->{words__}}) {
!             $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
!          }
      }

***************
*** 2067,2071 ****
      my ( $self, $bucket, $type ) = @_;

!     return sort keys %{$self->{magnets__}{$bucket}{$type}};
  }

--- 2126,2139 ----
      my ( $self, $bucket, $type ) = @_;

!     # In Japanese mode, disable locale.
!     # Sorting Japanese with "use locale" is memory and time consuming,
!     # and may cause perl crash.
! 
!     if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
!         no locale;
!         return sort keys %{$self->{magnets__}{$bucket}{$type}};
!     } else {
!         return sort keys %{$self->{magnets__}{$bucket}{$type}};
!     }
  }

***************
*** 2204,2212 ****
      return $self->{magnet_count__};
  }
!    
  sub wmformat
  {
      my ( $self, $value ) = @_;
!  
      $self->{wmformat__} = $value if (defined $value);
      return $self->{wmformat__};
--- 2272,2336 ----
      return $self->{magnet_count__};
  }
! 
! # ---------------------------------------------------------------------------------------------
! #
! # parse_with_kakasi
! #
! # Parse Japanese mail message with Kakasi
! #
! # Japanese needs to be parsed by language processing filter, "Kakasi"
! # before it is passed to Bayes classifier because words are not splitted
! # by spaces.
! #
! # $file           The file to parse
! #
! # ---------------------------------------------------------------------------------------------
! sub parse_with_kakasi
! {
!     my ( $self, $file, $dcount, $mcount ) = @_;
! 
!     # This is used for Japanese support
!     require Encode;
! 
!     # This is used to parse Japanese
!     require Text::Kakasi;
! 
!     my $temp_file  = $self->global_config_( 'msgdir' ) . "kakasi$dcount" . "=$mcount.msg";
!     print $temp_file;
! 
!     # Split Japanese email body into words using Kakasi Wakachigaki
!     # mode(-w is passed to Kakasi as argument). The most common charset of
!     # Japanese email is ISO-2022-JP, alias is jis, so -ijis and -ojis
!     # are passed to tell Kakasi the input charset and the output charset
!     # explicitly.
!     #
!     # After Kakasi processing, Encode::from_to is used to convert into UTF-8.
!     #
!     # Japanese email charset is assumed to be ISO-2022-JP. Needs to expand for
!     # other possible charset, such as Shift_JIS, EUC-JP, UTF-8.
! 
!     Text::Kakasi::getopt_argv("kakasi", "-w -ijis -ojis");
!     open KAKASI_IN, "<$file";
!     open KAKASI_OUT, ">$temp_file";
! 
!     while( <KAKASI_IN> ){
!         my $kakasi_out;
! 
! 	$kakasi_out = Text::Kakasi::do_kakasi($_);
!         Encode::from_to($kakasi_out, "iso-2022-jp", "euc-jp");
!         print KAKASI_OUT $kakasi_out;
!     }
! 
!     close KAKASI_OUT;
!     close KAKASI_IN;
!     Text::Kakasi::close_kanwadict();
!     unlink( $file );
!     rename( $temp_file, $file );
! }
! 
  sub wmformat
  {
      my ( $self, $value ) = @_;
! 
      $self->{wmformat__} = $value if (defined $value);
      return $self->{wmformat__};

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.161
retrieving revision 1.162
diff -C2 -d -r1.161 -r1.162
*** MailParse.pm	19 Sep 2003 20:18:52 -0000	1.161
--- MailParse.pm	22 Sep 2003 13:27:21 -0000	1.162
***************
*** 32,35 ****
--- 32,56 ----
  use MIME::QuotedPrint;

+ # These are used for Japanese support
+ 
+ use Encode;
+ my $ascii = '[\x00-\x7F]'; # ASCII chars
+ my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
+ my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
+ my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars
+ 
+ # Symbols in EUC-JP chars which cannot be considered a part of words
+ my $symbol_row1_euc_jp = '(?:[\xA1][\xA1-\xBB\xBD-\xFE])';
+ my $symbol_row2_euc_jp = '(?:[\xA2][\xA1-\xFE])';
+ my $symbol_row8_euc_jp = '(?:[\xA8][\xA1-\xFE])';
+ my $symbol_euc_jp = "(?:$symbol_row1_euc_jp|$symbol_row2_euc_jp|$symbol_row8_euc_jp)";
+ 
+ # Cho-on kigou(symbol in Japanese), a special symbol which can appear in middle of words
+ my $cho_on_symbol = '(?:\xA1\xBC)';
+ 
+ # Non-symbol EUC-JP chars
+ my $non_symbol_two_bytes_euc_jp = '(?:[\x8E\xA3-\xA7\xB0-\xFE][\xA1-\xFE])'; 
+ my $non_symbol_euc_jp = "(?:$non_symbol_two_bytes_euc_jp|$three_bytes_euc_jp|$cho_on_symbol)";
+ 
  # HTML entity mapping to character codes, this maps things like &amp; to their corresponding
  # character code
***************
*** 173,176 ****
--- 194,201 ----
      $self->{html_arg__}     = '';
      $self->{in_headers__}   = 0;
+ 
+     # This is used for switching on/off language specific functionality
+     $self->{lang__} = '';
+ 
      $self->{first20__}      = '';

***************
*** 462,476 ****
              }

!             # Only care about words between 3 and 45 characters since short words like
!             # an, or, if are too common and the longest word in English (according to
!             # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis

!             while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
!                 if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
!                     $self->{first20count__} += 1;
!                     $self->{first20__} .= " $1";
                  }

!                 update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
              }

--- 487,529 ----
              }

!             if ( $self->{lang__} eq 'Nihongo' ) {
!                 # In Japanese mode, non-symbol EUC-JP characters should be
!                 # matched.
!                 #
!                 # ^$euc_jp*? is added to avoid incorrect matching.
!                 # For example, EUC-JP char represented by code A4C8, should not
!                 # match the middle of two EUC-JP chars represented by CCA4 and
!                 # C8BE, the second byte of the first char and the first byte of
!                 # the second char.

!                 while ( $line =~ s/^$euc_jp*?(([A-Za-z]|$non_symbol_euc_jp)([A-Za-z\']|$non_symbol_euc_jp){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//ox ) {
!                     if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
!                         $self->{first20count__} += 1;
!                         $self->{first20__} .= " $1";
!                     }
! 
!                     my $matched_word = $1;
! 
!                     # In Japanese, 2 characters words are common, so care about
!                     # words between 2 and 45 characters
! 
!                     if (((length $matched_word >= 3) && ($matched_word =~ /[A-Za-z]/)) || ((length $matched_word >= 2) && ($matched_word =~ /$non_symbol_euc_jp/))) {
!                         update_word($self, $matched_word, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]'."|$symbol_euc_jp", $prefix);
!                     }
                  }

!             } else {
!                 # Only care about words between 3 and 45 characters since short words like
!                 # an, or, if are too common and the longest word in English (according to
!                 # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
! 
!                 while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
!                     if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
!                         $self->{first20count__} += 1;
!                         $self->{first20__} .= " $1";
!                     }
! 
!                     update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
!                 }
              }

***************
*** 1026,1034 ****
  #
  # $file     The file to open and parse
  #
  # ---------------------------------------------------------------------------------------------
  sub parse_file
  {
!     my ( $self, $file ) = @_;

      $self->start_parse();
--- 1079,1091 ----
  #
  # $file     The file to open and parse
+ # $lang     Pass in the current interface language for language specific parsing
  #
  # ---------------------------------------------------------------------------------------------
  sub parse_file
  {
!     # $lang is used for switching on/off language specific functionality
! 
!     my ( $self, $file, $lang ) = @_;
!     $self->{lang__} = $lang;

      $self->start_parse();
***************
*** 1385,1397 ****
      my $decode_it = '';

!     while ( $mystring =~ /=\?[\w-]+\?(B|Q)\?(.*?)\?=/ig ) {
!         if ($1 eq "B" || $1 eq "b") {
!             $decode_it = decode_base64( $2 );
              $mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i;
          } else {
!             if ($1 eq "Q" || $1 eq "q") {
!                 $decode_it = $2;
                  $decode_it =~ s/\_/=20/g;
                  $decode_it = decode_qp( $decode_it );
                  $mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i;
              }
--- 1442,1466 ----
      my $decode_it = '';

!     while ( $mystring =~ /=\?([\w-]+)\?(B|Q)\?(.*?)\?=/ig ) {
!         if ($2 eq "B" || $2 eq "b") {
!             $decode_it = decode_base64( $3 );
! 
!             # for Japanese header
!             if (uc($1) eq "ISO-2022-JP") {
!                 Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
!             }
! 
              $mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i;
          } else {
!             if ($2 eq "Q" || $2 eq "q") {
!                 $decode_it = $3;
                  $decode_it =~ s/\_/=20/g;
                  $decode_it = decode_qp( $decode_it );
+ 
+                 # for Japanese header
+                 if (uc($1) eq "ISO-2022-JP") {
+                     Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
+                 }
+ 
                  $mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i;
              }