[popfile-commit] engine/Classifier Bayes.pm, 1.353, 1.354 MailParse.pm, 1.229, 1.230

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26968/Classifier

Modified Files:
	Bayes.pm MailParse.pm 
Log Message:
Japanese users can now choose the Japanese parser by changing the 'bayes_nihongo_parser' option.
Performance improvement for Japanese users.
Decode the encoded attachment file names.

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.353
retrieving revision 1.354
diff -C2 -d -r1.353 -r1.354
*** Bayes.pm	26 Nov 2007 15:22:48 -0000	1.353
--- Bayes.pm	27 Nov 2007 14:44:56 -0000	1.354
***************
*** 230,233 ****
--- 230,238 ----
      $self->config_( 'localhostname', '' );

+     # Japanese wakachigaki parser ('kakasi' or 'mecab' or 'internal').
+     # TODO: Users can choose the parser engine to use?
+ 
+     $self->config_( 'nihongo_parser', 'kakasi' );
+ 
      $self->mq_register_( 'COMIT', $self );
      $self->mq_register_( 'RELSE', $self );
***************
*** 310,326 ****
      # Windows and using the fork.

!     if ( ( $self->{parser__}->{lang__} eq 'Nihongo' ) && ( $^O eq 'MSWin32' ) && 
!          ( ( ( $self->user_module_config_( 1, 'pop3', 'enabled' ) ) && 
!              ( $self->user_module_config_( 1, 'pop3', 'force_fork' ) ) ) || 
!            ( ( $self->user_module_config_( 1, 'nntp', 'enabled' ) ) && 
!              ( $self->user_module_config_( 1, 'nntp', 'force_fork' ) ) ) || 
!            ( ( $self->user_module_config_( 1, 'smtp', 'enabled' ) ) && 
!              ( $self->user_module_config_( 1, 'smtp', 'force_fork' ) ) ) ) ) {
!         $self->{parser__}->{need_kakasi_mutex__} = 1;

!         # Prepare the Mutex.
!         require POPFile::Mutex;
!         $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' );
!         $self->log_( 2, "Create mutex for Kakasi." );
      }

--- 315,347 ----
      # Windows and using the fork.

!     if ( $self->{parser__}->{lang__} eq 'Nihongo' ) {
!         # Setup Nihongo (Japanese) parser.

!         my $nihongo_parser = $self->config_( 'nihongo_parser' );
! 
!         $nihongo_parser = $self->{parser__}->setup_nihongo_parser( $nihongo_parser );
! 
!         $self->log_( 2, "Use Nihongo (Japanese) parser : $nihongo_parser" );
!         $self->config_( 'nihongo_parser', $nihongo_parser );
! 
!         # Since Text::Kakasi is not thread-safe, we use it under the
!         # control of a Mutex to avoid a crash if we are running on
!         # Windows and using the fork.
! 
!         if ( ( $nihongo_parser eq 'kakasi' ) && ( $^O eq 'MSWin32' ) && 
!              ( ( ( $self->user_module_config_( 1, 'pop3', 'enabled' ) ) && 
!                  ( $self->user_module_config_( 1, 'pop3', 'force_fork' ) ) ) || 
!                ( ( $self->user_module_config_( 1, 'nntp', 'enabled' ) ) && 
!                  ( $self->user_module_config_( 1, 'nntp', 'force_fork' ) ) ) || 
!                ( ( $self->user_module_config_( 1, 'smtp', 'enabled' ) ) && 
!                  ( $self->user_module_config_( 1, 'smtp', 'force_fork' ) ) ) ) ) {
! 
!             $self->{parser__}->{need_kakasi_mutex__} = 1;
! 
!             # Prepare the Mutex.
!             require POPFile::Mutex;
!             $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' );
!             $self->log_( 2, "Create mutex for Kakasi." );
!         }
      }

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.229
retrieving revision 1.230
diff -C2 -d -r1.229 -r1.230
*** MailParse.pm	26 Nov 2007 15:22:48 -0000	1.229
--- MailParse.pm	27 Nov 2007 14:44:56 -0000	1.230
***************
*** 70,73 ****
--- 70,92 ----
  my $non_symbol_euc_jp = "(?:$non_symbol_two_bytes_euc_jp|$three_bytes_euc_jp|$cho_on_symbol)";

+ # Constants for the internal wakachigaki parser.
+ # Kind of EUC-JP chars
+ my $euc_jp_symbol = '[\xA1\xA2\xA6-\xA8\xAD\xF9-\xFC][\xA1-\xFE]'; # The symbols make a word of one character.
+ my $euc_jp_alphanum = '(?:\xA3[\xB0-\xB9\xC1-\xDA\xE1-\xFA])+'; # One or more alphabets and numbers
+ my $euc_jp_hiragana = '(?:(?:\xA4[\xA1-\xF3])+(?:\xA1[\xAB\xAC\xB5\xB6\xBC])*)+'; # One or more Hiragana characters
+ my $euc_jp_katakana = '(?:(?:\xA5[\xA1-\xF6])+(?:\xA1[\xA6\xBC\xB3\xB4])*)+'; # One or more Katakana characters
+ my $euc_jp_hkatakana = '(?:\x8E[\xA6-\xDF])+'; # One or more Half-width Katakana characters
+ my $euc_jp_kanji = '[\xB0-\xF4][\xA1-\xFE](?:[\xB0-\xF4][\xA1-\xFE]|\xA1\xB9)?'; # One or two Kanji characters
+ 
+ my $euc_jp_word = '(' . 
+     $euc_jp_alphanum . '|' . 
+     $euc_jp_hiragana . '|' . 
+     $euc_jp_katakana . '|' . 
+     $euc_jp_hkatakana . '|' . 
+     $euc_jp_kanji . '|' . 
+     $euc_jp_symbol . '|' . 
+     $ascii . '+|' .
+     $three_bytes_euc_jp . ')';
+ 
  # HTML entity mapping to character codes, this maps things like &amp;
  # to their corresponding character code
***************
*** 231,234 ****
--- 250,256 ----
      $self->{prev__} = '';

+     # Object for the Nihongo (Japanese) parser.
+     $self->{nihongo_parser__} = undef;
+ 
      return $result;
  }
***************
*** 667,671 ****
                  # first char and the first byte of the second char.

!                 while ( $line =~ s/^$euc_jp*?(([A-Za-z]|$non_symbol_euc_jp)([A-Za-z\']|$non_symbol_euc_jp){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//ox ) {
                      if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
                          $self->{first20count__} += 1;
--- 689,696 ----
                  # first char and the first byte of the second char.

!                 # In Japanese, one character words are common, so care about
!                 # words between 2 and 45 characters
! 
!                 while ( $line =~ s/^$euc_jp*?([A-Za-z][A-Za-z\']{2,44}|$non_symbol_euc_jp{2,45})(?:[_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//o ) {
                      if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
                          $self->{first20count__} += 1;
***************
*** 673,684 ****
                      }

!                     my $matched_word = $1;
! 
!                     # In Japanese, 2 characters words are common, so
!                     # care about words between 2 and 45 characters
! 
!                     if (((length $matched_word >= 3) && ($matched_word =~ /[A-Za-z]/)) || ((length $matched_word >= 2) && ($matched_word =~ /$non_symbol_euc_jp/))) {
!                         update_word($self, $matched_word, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]'."|$symbol_euc_jp", $prefix);
!                     }
                  }
              } else {
--- 698,702 ----
                      }

!                     update_word($self, $1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]|'.$symbol_euc_jp, $prefix);
                  }
              } else {
***************
*** 1583,1591 ****
      $self->{charset__} = '';

-     # Since Text::Kakasi is not thread-safe, we use it under the
-     # control of a Mutex to avoid a crash if we are running on
-     # Windows.
- 
      if ( $self->{lang__} eq 'Nihongo' ) {
          if ( $self->{need_kakasi_mutex__} ) {
              require POPFile::Mutex;
--- 1601,1609 ----
      $self->{charset__} = '';

      if ( $self->{lang__} eq 'Nihongo' ) {
+ 
+         # Since Text::Kakasi is not thread-safe, we use it under the
+         # control of a Mutex to avoid a crash if we are running on
+         # Windows.
          if ( $self->{need_kakasi_mutex__} ) {
              require POPFile::Mutex;
***************
*** 1593,1598 ****
          }

!         # Open Kakasi dictionary and initialize
!         init_kakasi();
      }
  }
--- 1611,1616 ----
          }

!         # Initialize Nihongo (Japanese) parser
!         $self->{nihongo_parser__}{init}( $self );
      }
  }
***************
*** 1636,1641 ****

      if ( $self->{lang__} eq 'Nihongo' ) {
!         # Close Kakasi dictionary
!         close_kakasi();

          if ( $self->{need_kakasi_mutex__} ) {
--- 1654,1659 ----

      if ( $self->{lang__} eq 'Nihongo' ) {
!         # Close Nihongo (Japanese) parser
!         $self->{nihongo_parser__}{close}( $self );

          if ( $self->{need_kakasi_mutex__} ) {
***************
*** 1676,1682 ****
              if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) {
                  if ( $self->{lang__} eq 'Nihongo') {
!                     if ( $line =~ /=\r\n$/ ) {
                          # Encoded in multiple lines
-                         $line =~ s/=\r\n$//g;
                          $self->{prev__} .= $line;
                          next;
--- 1694,1699 ----
              if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) {
                  if ( $self->{lang__} eq 'Nihongo') {
!                     if ( $line =~ s/=\r\n$// ) {
                          # Encoded in multiple lines
                          $self->{prev__} .= $line;
                          next;
***************
*** 1690,1701 ****
              }

!             if ( $self->{lang__} eq 'Nihongo' ) {
                  # Decode \x??
!                 if ( !$self->{in_headers__} ) {
!                     $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig;
!                 }

                  $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} );
!                 $line = parse_line_with_kakasi( $self, $line );
              }

--- 1707,1716 ----
              }

!             if ( $self->{lang__} eq 'Nihongo' && !$self->{in_headers__} && $self->{encoding__} !~ /base64/i ) {
                  # Decode \x??
!                 $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig;

                  $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} );
!                 $line = $self->{nihongo_parser__}{parse}( $self, $line );
              }

***************
*** 1869,1873 ****
          if ( $self->{lang__} eq 'Nihongo' ) {
              $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} );
!             $decoded = parse_line_with_kakasi( $self, $decoded );
          }

--- 1884,1888 ----
          if ( $self->{lang__} eq 'Nihongo' ) {
              $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} );
!             $decoded = $self->{nihongo_parser__}{parse}( $self, $decoded );
          }

***************
*** 2093,2099 ****
          if ( $self->{subject__} eq '' ) {

!             # In Japanese mode, parse subject with kakasi

!             $argument = parse_line_with_kakasi( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' );

              $self->{subject__} = $argument;
--- 2108,2114 ----
          if ( $self->{subject__} eq '' ) {

!             # In Japanese mode, parse subject with Nihongo (Japanese) parser

!             $argument = $self->{nihongo_parser__}{parse}( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' );

              $self->{subject__} = $argument;
***************
*** 2457,2460 ****
--- 2472,2478 ----
          print "Add filename $filename\n" if $self->{debug__};

+         # Decode the filename
+         $filename = $self->decode_string( $filename );
+ 
          my ( $name, $ext ) = $self->file_extension( $filename );

***************
*** 2484,2488 ****
      my ( $attachment, $filename ) = $self->match_attachment_filename( $params );

!     if ( $attachment eq 'attachment' ) {
          $self->add_attachment_filename( $filename ) ;
      }
--- 2502,2506 ----
      my ( $attachment, $filename ) = $self->match_attachment_filename( $params );

!     if ( defined( $attachment ) && ( $attachment eq 'attachment' ) ) {
          $self->add_attachment_filename( $filename ) ;
      }
***************
*** 2562,2566 ****

      if(ref $enc){
!        $from= $enc->name;
      } else {

--- 2580,2584 ----

      if(ref $enc){
!         $from = $enc->name;
      } else {

***************
*** 2580,2584 ****
          # Workaround for Encode::Unicode error bug.
          eval {
!             Encode::from_to($string, $from, $to);
          };
          $string = $orig_string if ($@);
--- 2598,2606 ----
          # Workaround for Encode::Unicode error bug.
          eval {
!             if (ref $enc) {
!                 $string = Encode::encode($to, $enc->decode($string));
!             } else {
!                 Encode::from_to($string, $from, $to);
!             }
          };
          $string = $orig_string if ($@);
***************
*** 2607,2613 ****
      return $line if ( $line =~ /^[\x00-\x7F]*$/ );

-     # This is used to parse Japanese
-     require Text::Kakasi;
- 
      # Split Japanese line into words using Kakasi Wakachigaki mode
      $line = Text::Kakasi::do_kakasi($line);
--- 2629,2632 ----
***************
*** 2618,2621 ****
--- 2637,2692 ----
  # ----------------------------------------------------------------------------
  #
+ # parse_line_with_mecab
+ #
+ # Parse a line with MeCab
+ #
+ # Split Japanese words by spaces using "MeCab" - Yet Another Part-of-Speech 
+ # and Morphological Analyzer.
+ #
+ # $line          The line to be parsed
+ #
+ # ----------------------------------------------------------------------------
+ sub parse_line_with_mecab
+ {
+     my ( $self, $line ) = @_;
+ 
+     # If the line does not contain Japanese characters, do nothing
+     return $line if ( $line =~ /^[\x00-\x7F]*$/ );
+ 
+     # Split Japanese line into words using MeCab
+     $line = $self->{nihongo_parser__}{obj_mecab}->parse($line);
+ 
+     # Remove the unnecessary white spaces
+     $line =~ s/([\x00-\x1f\x21-\x7f]) (?=[\x00-\x1f\x21-\x7f])/$1/g;
+ 
+     return $line;
+ }
+ 
+ # ----------------------------------------------------------------------------
+ #
+ # parse_line_with_internal_parser
+ #
+ # Parse a line with an internal perser
+ #
+ # Split characters by kind of the character
+ #
+ # $line          The line to be parsed
+ #
+ # ----------------------------------------------------------------------------
+ sub parse_line_with_internal_parser
+ {
+     my ( $self, $line ) = @_;
+ 
+     # If the line does not contain Japanese characters, do nothing
+     return $line if ( $line =~ /^[\x00-\x7F]*$/ );
+ 
+     # Split Japanese line into words by the kind of characters
+     $line =~ s/\G$euc_jp_word/$1 /og;
+ 
+     return $line;
+ }
+ 
+ # ----------------------------------------------------------------------------
+ #
  # init_kakasi
  #
***************
*** 2625,2635 ****
  sub init_kakasi
  {
-     require Text::Kakasi;
- 
      # Initialize Kakasi with Wakachigaki mode(-w is passed to 
      # Kakasi as argument). Both input and ouput encoding are 
      # EUC-JP.

!     Text::Kakasi::getopt_argv("kakasi", "-w", "-ieuc", "-oeuc");
  }

--- 2696,2722 ----
  sub init_kakasi
  {
      # Initialize Kakasi with Wakachigaki mode(-w is passed to 
      # Kakasi as argument). Both input and ouput encoding are 
      # EUC-JP.

!     Text::Kakasi::getopt_argv('kakasi', '-w', '-ieuc', '-oeuc');
! }
! 
! # ----------------------------------------------------------------------------
! #
! # init_mecab
! #
! # Create a new parser object of MeCab.
! #
! # ----------------------------------------------------------------------------
! sub init_mecab
! {
!     my ( $self ) = @_;
! 
!     # Initialize MeCab (-F %M\s -U %M\s -E \n is passed to MeCab as argument).
!     # Insert white spaces after words.
! 
!     $self->{nihongo_parser__}{obj_mecab} 
!         = MeCab::Tagger->new('-F %M\s -U %M\s -E \n');
  }

***************
*** 2643,2651 ****
  sub close_kakasi
  {
-     require Text::Kakasi;
- 
      Text::Kakasi::close_kanwadict();
  }

  1;
--- 2730,2823 ----
  sub close_kakasi
  {
      Text::Kakasi::close_kanwadict();
  }

+ # ----------------------------------------------------------------------------
+ #
+ # close_mecab
+ #
+ # Free the parser object of MeCab.
+ #
+ # ----------------------------------------------------------------------------
+ sub close_mecab
+ {
+     my ( $self ) = @_;
+ 
+     $self->{nihongo_parser__}{obj_mecab} = undef;
+ }
+ 
+ # ----------------------------------------------------------------------------
+ #
+ # setup_nihongo_parser
+ #
+ # Check whether Nihongo (Japanese) parsers are available and setup subroutines.
+ #
+ # $nihongo_parser  Nihongo (Japanese) parser to use
+ #                  ( kakasi / mecab / internal )
+ #
+ # ----------------------------------------------------------------------------
+ sub setup_nihongo_parser
+ {
+     my ( $self, $nihongo_parser ) = @_;
+ 
+     # If MeCab is installed, use MeCab.
+     if ( $nihongo_parser eq 'mecab' ) {
+         my $has_mecab = 0;
+ 
+         foreach my $prefix (@INC) {
+             my $realfilename = "$prefix/MeCab.pm";
+             if (-f $realfilename) {
+                 $has_mecab = 1;
+                 last;
+             }
+         }
+ 
+         # If MeCab is not installed, try to use Text::Kakasi.
+         $nihongo_parser = 'kakasi' unless ( $has_mecab );
+     }
+ 
+     # If Text::Kakasi is installed, use Text::Kakasi.
+     if ( $nihongo_parser eq 'kakasi' ) {
+         my $has_kakasi = 0;
+ 
+         foreach my $prefix (@INC) {
+             my $realfilename = "$prefix/Text/Kakasi.pm";
+             if (-f $realfilename) {
+                 $has_kakasi = 1;
+                 last;
+             }
+         }
+ 
+         # If Kakasi is not installed, use the internal parser.
+         $nihongo_parser = 'internal' unless ( $has_kakasi );
+     }
+ 
+     # Setup perser's subroutines
+     if ( $nihongo_parser eq 'mecab' ) {
+         # Import MeCab module
+         require MeCab;
+         import MeCab;
+ 
+         $self->{nihongo_parser__}{init} = \&init_mecab;
+         $self->{nihongo_parser__}{parse} = \&parse_line_with_mecab;
+         $self->{nihongo_parser__}{close} = \&close_mecab;
+     } elsif ( $nihongo_parser eq 'kakasi' ) {
+         # Import Text::Kakasi module
+         require Text::Kakasi;
+         import Text::Kakasi;
+ 
+         $self->{nihongo_parser__}{init} = \&init_kakasi;
+         $self->{nihongo_parser__}{parse} = \&parse_line_with_kakasi;
+         $self->{nihongo_parser__}{close} = \&close_kakasi;
+     } else {
+         # Require no external modules
+         $self->{nihongo_parser__}{init} = sub {}; # Needs no initialization
+         $self->{nihongo_parser__}{parse} = \&parse_line_with_internal_parser;
+         $self->{nihongo_parser__}{close} = sub {};
+     }
+ 
+     return $nihongo_parser;
+ }
+ 

  1;