|
From: <jgr...@us...> - 2003-10-01 14:34:44
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv26845/Classifier
Modified Files:
Bayes.pm MailParse.pm WordMangle.pm
Log Message:
Merge updated Japanese support that fixes small bugs and removes the need for Encode for non-Japanese users
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.202
retrieving revision 1.203
diff -C2 -d -r1.202 -r1.203
*** Bayes.pm 29 Sep 2003 23:19:45 -0000 1.202
--- Bayes.pm 1 Oct 2003 14:34:28 -0000 1.203
***************
*** 1560,1564 ****
my $prev = '';
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
}
--- 1560,1601 ----
my $prev = '';
!
! # In Japanese mode, disable locale and use substr_euc, the substr function
! # which supports EUC Japanese charset.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
! no locale;
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
! } else {
! return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} grep {!/__POPFILE__(UNIQUE|TOTAL)__/} keys %{$self->{matrix__}{$bucket}};
! }
! }
!
! # ---------------------------------------------------------------------------------------------
! #
! # substr_euc
! #
! # "substr" function which supports EUC Japanese charset
! #
! # $pos Start position
! # $len Word length
! #
! # ---------------------------------------------------------------------------------------------
! sub substr_euc {
! my ($str, $pos, $len) = @_;
! my $result_str;
! my $char;
! my $count=0;
! if(!$pos) { $pos=0; }
! if(!$len) { $len=length($str); }
! for ($pos = 0; $count<$len; $pos++) {
! $char = substr($str, $pos, 1);
! if ($char =~ /[\x80-\xff]/) { $char = substr($str, $pos++, 2); }
! $result_str .= $char;
! $count++;
! }
! return $result_str;
}
***************
*** 2186,2190 ****
my ( $self, $stopword ) = @_;
! return $self->{parser__}->{mangle__}->add_stopword( $stopword );
}
--- 2223,2229 ----
my ( $self, $stopword ) = @_;
! # Pass language parameter to add_stopword()
!
! return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
}
***************
*** 2193,2197 ****
my ( $self, $stopword ) = @_;
! return $self->{parser__}->{mangle__}->remove_stopword( $stopword );
}
--- 2232,2238 ----
my ( $self, $stopword ) = @_;
! # Pass language parameter to remove_stopword()
!
! return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) );
}
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.165
retrieving revision 1.166
diff -C2 -d -r1.165 -r1.166
*** MailParse.pm 29 Sep 2003 23:19:45 -0000 1.165
--- MailParse.pm 1 Oct 2003 14:34:28 -0000 1.166
***************
*** 34,38 ****
# These are used for Japanese support
- use Encode;
my $ascii = '[\x00-\x7F]'; # ASCII chars
my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
--- 34,37 ----
***************
*** 1429,1432 ****
--- 1428,1432 ----
# =?charset?[BQ]?text?=
#
+ # $lang Pass in the current interface language for language specific encoding conversion
# A B indicates base64 encoding, a Q indicates quoted printable encoding
# ---------------------------------------------------------------------------------------------
***************
*** 1438,1442 ****
# the original string with it later. Thus, this subroutine returns the real decoded result.
! my ( $self, $mystring ) = @_;
my $decode_it = '';
--- 1438,1442 ----
# the original string with it later. Thus, this subroutine returns the real decoded result.
! my ( $self, $mystring, $lang ) = @_;
my $decode_it = '';
***************
*** 1447,1452 ****
# for Japanese header
! if (uc($1) eq "ISO-2022-JP") {
! Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
}
--- 1447,1452 ----
# for Japanese header
! if ((uc($1) eq "ISO-2022-JP") && ( $lang eq 'Nihongo' )) {
! $decode_it = convert_encoding($decode_it, "iso-2022-jp", "euc-jp");
}
***************
*** 1459,1464 ****
# for Japanese header
! if (uc($1) eq "ISO-2022-JP") {
! Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
}
--- 1459,1464 ----
# for Japanese header
! if ((uc($1) eq "ISO-2022-JP") && ( $lang eq 'Nihongo' )) {
! $decode_it = convert_encoding($decode_it, "iso-2022-jp", "euc-jp");
}
***************
*** 1781,1785 ****
}
! 1;
--- 1781,1803 ----
}
! # ---------------------------------------------------------------------------------------------
! #
! # convert_encoding
! #
! # Convert string from one encoding to another
! #
! # $string The string to be converted
! # $from Original encoding
! # $to The encoding which the string is converted to
! # ---------------------------------------------------------------------------------------------
! sub convert_encoding
! {
! my ( $string, $from, $to ) = @_;
! require Encode;
!
! Encode::from_to($string, $from, $to);
+ return $string;
+ }
+ 1;
Index: WordMangle.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/WordMangle.pm,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** WordMangle.pm 24 Aug 2003 03:30:08 -0000 1.29
--- WordMangle.pm 1 Oct 2003 14:34:28 -0000 1.30
***************
*** 29,32 ****
--- 29,39 ----
use locale;
+ # These are used for Japanese support
+
+ my $ascii = '[\x00-\x7F]'; # ASCII chars
+ my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
+ my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
+ my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars
+
#----------------------------------------------------------------------------
# new
***************
*** 146,153 ****
sub add_stopword
{
! my ( $self, $stopword ) = @_;
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
! return 0;
}
--- 153,168 ----
sub add_stopword
{
! my ( $self, $stopword, $lang ) = @_;
! # In Japanese mode, reject non EUC Japanese characters.
!
! if ( $lang eq 'Nihongo') {
! if ( $stopword !~ /$euc_jp/i ) {
! return 0;
! }
! } else {
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
! return 0;
! }
}
***************
*** 166,173 ****
sub remove_stopword
{
! my ( $self, $stopword ) = @_;
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
! return 0;
}
--- 181,196 ----
sub remove_stopword
{
! my ( $self, $stopword, $lang ) = @_;
! # In Japanese mode, reject non EUC Japanese characters.
!
! if ( $lang eq 'Nihongo') {
! if ( $stopword !~ /$euc_jp/i ) {
! return 0;
! }
! } else {
! if ( $stopword =~ /[^[:lower:]\-_\.\@0-9]/i ) {
! return 0;
! }
}
|