|
From: <jgr...@us...> - 2003-10-10 14:59:42
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv5117/Classifier
Modified Files:
Bayes.pm MailParse.pm
Log Message:
Merge patches for Korean and Japanese support
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.207
retrieving revision 1.208
diff -C2 -d -r1.207 -r1.208
*** Bayes.pm 9 Oct 2003 13:53:13 -0000 1.207
--- Bayes.pm 10 Oct 2003 14:59:32 -0000 1.208
***************
*** 50,53 ****
--- 50,62 ----
use BerkeleyDB;
+ # Korean characters definition
+
+ my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
+ my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
+ my $ksc5601_hanja = '(?:[\xCA-\xFD][\xA1-\xFE])';
+ my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
+
+ my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
+
#----------------------------------------------------------------------------
# new
***************
*** 1675,1681 ****
if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
- my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
- my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])";
-
return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
} else {
--- 1684,1687 ----
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.167
retrieving revision 1.168
diff -C2 -d -r1.167 -r1.168
*** MailParse.pm 6 Oct 2003 14:24:41 -0000 1.167
--- MailParse.pm 10 Oct 2003 14:59:32 -0000 1.168
***************
*** 35,39 ****
# Korean characters definition
! my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
--- 35,43 ----
# Korean characters definition
! my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])';
! my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])';
! my $ksc5601_hanja = '(?:[\xCA-\xFD][\xA1-\xFE])';
! my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)";
!
my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
***************
*** 430,434 ****
if ( defined( $to ) ) {
! $to = chr($to);
$line =~ s/$from/$to/g;
$self->{ut__} =~ s/$from/$to/g;
--- 434,445 ----
if ( defined( $to ) ) {
!
! # HTML entities conflict with DBCS chars. Replace entities with blanks.
!
! if ( $self->{lang__} eq 'Korean' ) {
! $to = ' ';
! } else {
! $to = chr($to);
! }
$line =~ s/$from/$to/g;
$self->{ut__} =~ s/$from/$to/g;
***************
*** 523,528 ****
# to support 2 byte characters.
#
! # In Korean, 1 character(2 bytes) words are meaningful, so care about
! # words between 2 and 45 characters.
while ( $line =~ s/(([A-Za-z]|$eksc)([A-Za-z\']|$eksc){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
--- 534,538 ----
# to support 2 byte characters.
#
! # In Korean, care about words between 2 and 45 characters.
while ( $line =~ s/(([A-Za-z]|$eksc)([A-Za-z\']|$eksc){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
|