[popfile-commit] engine/Classifier Bayes.pm,1.207,1.208 MailParse.pm,1.167,1.168

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv5117/Classifier

Modified Files:
	Bayes.pm MailParse.pm 
Log Message:
Merge patches for Korean and Japanese support

Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.207
retrieving revision 1.208
diff -C2 -d -r1.207 -r1.208
*** Bayes.pm	9 Oct 2003 13:53:13 -0000	1.207
--- Bayes.pm	10 Oct 2003 14:59:32 -0000	1.208
***************
*** 50,53 ****
--- 50,62 ----
  use BerkeleyDB;
  
+ # Korean characters definition
+ 
+ my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
+ my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
+ my $ksc5601_hanja  = '(?:[\xCA-\xFD][\xA1-\xFE])';
+ my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
+ 
+ my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
+ 
  #----------------------------------------------------------------------------
  # new
***************
*** 1675,1681 ****
          if  ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
      	    no locale;
-             my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
-             my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])";
- 
              return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
          } else {
--- 1684,1687 ----

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.167
retrieving revision 1.168
diff -C2 -d -r1.167 -r1.168
*** MailParse.pm	6 Oct 2003 14:24:41 -0000	1.167
--- MailParse.pm	10 Oct 2003 14:59:32 -0000	1.168
***************
*** 35,39 ****
  # Korean characters definition
  
! my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
  my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
  
--- 35,43 ----
  # Korean characters definition
  
! my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
! my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
! my $ksc5601_hanja  = '(?:[\xCA-\xFD][\xA1-\xFE])';
! my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
! 
  my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
  
***************
*** 430,434 ****
  
                  if ( defined( $to ) ) {
!                     $to         = chr($to);
                      $line       =~ s/$from/$to/g;
                      $self->{ut__} =~ s/$from/$to/g;
--- 434,445 ----
  
                  if ( defined( $to ) ) {
!                     
!                     # HTML entities conflict with DBCS chars. Replace entities with blanks.
!                     
!                     if ( $self->{lang__} eq 'Korean' ) {
!                     	$to = ' ';
!                     } else {
!                 	$to = chr($to);
!                     }
                      $line       =~ s/$from/$to/g;
                      $self->{ut__} =~ s/$from/$to/g;
***************
*** 523,528 ****
                      # to support 2 byte characters.
                      #
!                     # In Korean, 1 character(2 bytes) words are meaningful, so care about
!                     # words between 2 and 45 characters.
  
                      while ( $line =~ s/(([A-Za-z]|$eksc)([A-Za-z\']|$eksc){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
--- 534,538 ----
                      # to support 2 byte characters.
                      #
!                     # In Korean, care about words between 2 and 45 characters.
  
                      while ( $line =~ s/(([A-Za-z]|$eksc)([A-Za-z\']|$eksc){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {