[popfile-commit] engine/Classifier Bayes.pm,1.202,1.203 MailParse.pm,1.165,1.166 WordMangle.pm,1.29,

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv26845/Classifier

Modified Files:
	Bayes.pm MailParse.pm WordMangle.pm 
Log Message:
Merge updated Japanese support that fixes small bugs and removes the need for Encode for non-Japanese users

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.202
retrieving revision 1.203
diff -C2 -d -r1.202 -r1.203
*** Bayes.pm	29 Sep 2003 23:19:45 -0000	1.202
--- Bayes.pm	1 Oct 2003 14:34:28 -0000	1.203
***************
*** 1560,1564 ****
  
      my $prev = '';
!     return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
  }
  
--- 1560,1601 ----
  
      my $prev = '';
! 
!     # In Japanese mode, disable locale and use substr_euc, the substr function
!     # which supports EUC Japanese charset.
!     # Sorting Japanese with "use locale" is memory and time consuming,
!     # and may cause perl crash.
! 
!     if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
!         no locale;
!         return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
!     } else {
!         return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
!     }
! }
! 
! # ---------------------------------------------------------------------------------------------
! #
! # substr_euc
! #
! # "substr" function which supports EUC Japanese charset
! #
! # $pos      Start position
! # $len      Word length
! #
! # ---------------------------------------------------------------------------------------------
! sub substr_euc {
!     my ($str, $pos, $len) = @_;
!     my $result_str;
!     my $char;
!     my $count=0;
!     if(!$pos) { $pos=0; }
!     if(!$len) { $len=length($str); }
!     for ($pos = 0; $count<$len; $pos++) {
!         $char = substr($str, $pos, 1);
!         if ($char =~ /[\x80-\xff]/) { $char = substr($str, $pos++, 2); }
!         $result_str .= $char;
!         $count++;
!     }
!     return $result_str;
  }
  
***************
*** 2186,2190 ****
      my ( $self, $stopword ) = @_;
  
!     return $self->{parser__}->{mangle__}->add_stopword( $stopword );
  }
  
--- 2223,2229 ----
      my ( $self, $stopword ) = @_;
  
!     # Pass language parameter to add_stopword()
! 
!     return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
  }
  
***************
*** 2193,2197 ****
      my ( $self, $stopword ) = @_;
  
!     return $self->{parser__}->{mangle__}->remove_stopword( $stopword );
  }
  
--- 2232,2238 ----
      my ( $self, $stopword ) = @_;
  
!     # Pass language parameter to remove_stopword()
! 
!     return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
  }
  

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.165
retrieving revision 1.166
diff -C2 -d -r1.165 -r1.166
*** MailParse.pm	29 Sep 2003 23:19:45 -0000	1.165
--- MailParse.pm	1 Oct 2003 14:34:28 -0000	1.166
***************
*** 34,38 ****
  # These are used for Japanese support
  
- use Encode;
  my $ascii = '[\x00-\x7F]'; # ASCII chars
  my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
--- 34,37 ----
***************
*** 1429,1432 ****
--- 1428,1432 ----
  # =?charset?[BQ]?text?=
  #
+ # $lang     Pass in the current interface language for language specific encoding conversion
  # A B indicates base64 encoding, a Q indicates quoted printable encoding
  # ---------------------------------------------------------------------------------------------
***************
*** 1438,1442 ****
      # the original string with it later. Thus, this subroutine returns the real decoded result.
  
!     my ( $self, $mystring ) = @_;
  
      my $decode_it = '';
--- 1438,1442 ----
      # the original string with it later. Thus, this subroutine returns the real decoded result.
  
!     my ( $self, $mystring, $lang ) = @_;
  
      my $decode_it = '';
***************
*** 1447,1452 ****
  
              # for Japanese header
!             if (uc($1) eq "ISO-2022-JP") {
!                 Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
              }
  
--- 1447,1452 ----
  
              # for Japanese header
!             if ((uc($1) eq "ISO-2022-JP") && ( $lang eq 'Nihongo' )) {
!                 $decode_it = convert_encoding($decode_it, "iso-2022-jp", "euc-jp");
              }
  
***************
*** 1459,1464 ****
  
                  # for Japanese header
!                 if (uc($1) eq "ISO-2022-JP") {
!                     Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
                  }
  
--- 1459,1464 ----
  
                  # for Japanese header
!                 if ((uc($1) eq "ISO-2022-JP") && ( $lang eq 'Nihongo' )) {
!                     $decode_it = convert_encoding($decode_it, "iso-2022-jp", "euc-jp");
                  }
  
***************
*** 1781,1785 ****
  }
  
! 1;
  
  
--- 1781,1803 ----
  }
  
! # ---------------------------------------------------------------------------------------------
! #
! # convert_encoding
! #
! # Convert string from one encoding to another
! #
! # $string       The string to be converted
! # $from         Original encoding
! # $to           The encoding which the string is converted to
! # ---------------------------------------------------------------------------------------------
! sub convert_encoding
! {
!     my ( $string, $from, $to ) = @_;
!     require Encode;
! 
!     Encode::from_to($string, $from, $to);
  
+     return $string;
+ }
  
+ 1;

Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** WordMangle.pm	24 Aug 2003 03:30:08 -0000	1.29
--- WordMangle.pm	1 Oct 2003 14:34:28 -0000	1.30
***************
*** 29,32 ****
--- 29,39 ----
  use locale;
  
+ # These are used for Japanese support
+ 
+ my $ascii = '[\x00-\x7F]'; # ASCII chars
+ my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
+ my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
+ my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars
+ 
  #----------------------------------------------------------------------------
  # new
***************
*** 146,153 ****
  sub add_stopword
  {
!     my ( $self, $stopword ) = @_;
  
!     if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
!         return 0;
      }
  
--- 153,168 ----
  sub add_stopword
  {
!     my ( $self, $stopword, $lang ) = @_;
  
!     # In Japanese mode, reject non EUC Japanese characters.
! 
!     if ( $lang eq 'Nihongo') {
!         if ( $stopword !~ /$euc_jp/i ) {
!             return 0;
!         }
!     } else {
!         if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
!             return 0;
!         }
      }
  
***************
*** 166,173 ****
  sub remove_stopword
  {
!     my ( $self, $stopword ) = @_;
  
!     if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
!         return 0;
      }
  
--- 181,196 ----
  sub remove_stopword
  {
!     my ( $self, $stopword, $lang ) = @_;
  
!     # In Japanese mode, reject non EUC Japanese characters.
! 
!     if ( $lang eq 'Nihongo') {
!         if ( $stopword !~ /$euc_jp/i ) {
!             return 0;
!         }
!     } else {
!         if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
!             return 0;
!         }
      }

[popfile-commit] engine/Classifier Bayes.pm,1.202,1.203 MailParse.pm,1.165,1.166 WordMangle.pm,1.29,

[popfile-commit] engine/Classifier Bayes.pm,1.202,1.203 MailParse.pm,1.165,1.166 WordMangle.pm,1.29,1.30