|
From: <jgr...@us...> - 2003-10-06 14:24:48
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv10730/Classifier
Modified Files:
Bayes.pm MailParse.pm
Log Message:
Merged Korean patch
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.203
retrieving revision 1.204
diff -C2 -d -r1.203 -r1.204
*** Bayes.pm 1 Oct 2003 14:34:28 -0000 1.203
--- Bayes.pm 6 Oct 2003 14:24:41 -0000 1.204
***************
*** 787,791 ****
# and may cause perl crash.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
--- 787,794 ----
# and may cause perl crash.
! # Disable the locale in Korean mode, too.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
***************
*** 1570,1574 ****
return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
} else {
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
}
}
--- 1573,1585 ----
return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
} else {
! if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
! no locale;
! my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
! my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])";
!
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
! } else {
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
! }
}
}
***************
*** 1888,1892 ****
# and may cause perl crash.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
foreach my $word (keys %{$self->{parser__}->{words__}}) {
--- 1899,1905 ----
# and may cause perl crash.
! # Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
foreach my $word (keys %{$self->{parser__}->{words__}}) {
***************
*** 1951,1955 ****
# and may cause perl crash.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
foreach my $word (keys %{$self->{parser__}->{words__}}) {
--- 1964,1970 ----
# and may cause perl crash.
! # Disable the locale in Korean mode, too.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
foreach my $word (keys %{$self->{parser__}->{words__}}) {
***************
*** 2115,2119 ****
# and may cause perl crash.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
no locale;
return sort keys %{$self->{magnets__}{$bucket}{$type}};
--- 2130,2137 ----
# and may cause perl crash.
! # Disable the locale in Korean mode, too.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ||
! $self->module_config_( 'html', 'language' ) eq 'Korean' ) {
no locale;
return sort keys %{$self->{magnets__}{$bucket}{$type}};
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.166
retrieving revision 1.167
diff -C2 -d -r1.166 -r1.167
*** MailParse.pm 1 Oct 2003 14:34:28 -0000 1.166
--- MailParse.pm 6 Oct 2003 14:24:41 -0000 1.167
***************
*** 32,35 ****
--- 32,41 ----
use MIME::QuotedPrint;
+
+ # Korean characters definition
+
+ my $ksc5601 = '(?:[\xA1-\xFE][\xA1-\xFE])';
+ my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc
+
# These are used for Japanese support
***************
*** 511,527 ****
}
}
-
} else {
! # Only care about words between 3 and 45 characters since short words like
! # an, or, if are too common and the longest word in English (according to
! # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
! while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
! }
! update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
}
}
--- 517,551 ----
}
}
} else {
! if ( $self->{lang__} eq 'Korean' ) {
! # In Korean mode, [[:alpha:]] in regular expression is changed to 2bytes chars
! # to support 2 byte characters.
! #
! # In Korean, 1 character(2 bytes) words are meaningful, so care about
! # words between 2 and 45 characters.
! while ( $line =~ s/(([A-Za-z]|$eksc)([A-Za-z\']|$eksc){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
! }
!
! update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 2);
! }
! } else {
!
! # Only care about words between 3 and 45 characters since short words like
! # an, or, if are too common and the longest word in English (according to
! # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
!
! while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
! }
!
! update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
! }
}
}
|