|
From: <jgr...@us...> - 2003-09-22 13:27:26
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv21997/Classifier
Modified Files:
Bayes.pm MailParse.pm
Log Message:
Merge patch that makes POPFile work well with the Japanese language
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.199
retrieving revision 1.200
diff -C2 -d -r1.199 -r1.200
*** Bayes.pm 22 Sep 2003 13:06:39 -0000 1.199
--- Bayes.pm 22 Sep 2003 13:27:21 -0000 1.200
***************
*** 783,791 ****
my $msg_total = 0;
$self->{magnet_used__} = 0;
$self->{magnet_detail__} = '';
if ( defined( $file ) ) {
! $self->{parser__}->parse_file( $file );
}
--- 783,793 ----
my $msg_total = 0;
+ # Pass language parameter to parse_file()
+
$self->{magnet_used__} = 0;
$self->{magnet_detail__} = '';
if ( defined( $file ) ) {
! $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
}
***************
*** 809,825 ****
$noattype =~ s/\$/__POPFILE_DOLLAR__/g;
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
! return $bucket;
}
}
--- 811,850 ----
$noattype =~ s/\$/__POPFILE_DOLLAR__/g;
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
! no locale;
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
!
! return $bucket;
! }
! }
! } else {
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
!
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
!
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
!
! return $bucket;
! }
}
}
***************
*** 855,859 ****
# the values displayed in scores__; it has no effect on the classification
# process.
!
my $correction = 0;
--- 880,884 ----
# the values displayed in scores__; it has no effect on the classification
# process.
!
my $correction = 0;
***************
*** 874,878 ****
$score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
}
!
if ($wmax > $self->{not_likely__}) {
$correction += $self->{not_likely__} * $self->{parser__}{words__}{$word};
--- 899,903 ----
$score{$bucket} += ( $probability * $self->{parser__}{words__}{$word} );
}
!
if ($wmax > $self->{not_likely__}) {
$correction += $self->{not_likely__} * $self->{parser__}{words__}{$word};
***************
*** 915,922 ****
}
-
$class = 'unsure' if ( $certainty < 0.4 );
-
# Compute the total of all the scores to generate the normalized scores and probability
# estimate. $total is always 1 after the first loop iteration, so any additional term
--- 940,945 ----
***************
*** 976,980 ****
$self->{scores__} .= "<a name=\"scores\">";
!
# If there are fewer than 2 buckets, there is no "verdict " to mention.
if (@buckets > 1) {
--- 999,1003 ----
$self->{scores__} .= "<a name=\"scores\">";
!
# If there are fewer than 2 buckets, there is no "verdict " to mention.
if (@buckets > 1) {
***************
*** 1018,1026 ****
$self->{scores__} .= "</table><hr>";
!
# We want a link to change the format here. But only the UI knows how to build
# that link. So we just insert a comment which can be replaced by the UI. There's
# probably a better way.
!
$self->{scores__} .= "<!--format--><p>";
$self->{scores__} .= "<table class=\"top20Words\">\n";
--- 1041,1049 ----
$self->{scores__} .= "</table><hr>";
!
# We want a link to change the format here. But only the UI knows how to build
# that link. So we just insert a comment which can be replaced by the UI. There's
# probably a better way.
!
$self->{scores__} .= "<!--format--><p>";
$self->{scores__} .= "<table class=\"top20Words\">\n";
***************
*** 1036,1043 ****
my %wordprobs;
!
# If the word matrix is supposed to show probabilities, compute them,
# saving the results in %wordprobs.
!
if ( $self->{wmformat__} eq 'prob') {
foreach my $word (keys %{$self->{parser__}->{words__}}) {
--- 1059,1066 ----
my %wordprobs;
!
# If the word matrix is supposed to show probabilities, compute them,
# saving the results in %wordprobs.
!
if ( $self->{wmformat__} eq 'prob') {
foreach my $word (keys %{$self->{parser__}->{words__}}) {
***************
*** 1053,1057 ****
}
}
!
my @ranked_words;
if ($self->{wmformat__} eq 'prob') {
--- 1076,1080 ----
}
}
!
my @ranked_words;
if ($self->{wmformat__} eq 'prob') {
***************
*** 1400,1403 ****
--- 1423,1432 ----
close TEMP;
+ # Parse Japanese mail message with Kakasi
+
+ if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
+ parse_with_kakasi( $self, $temp_file, $dcount, $mcount );
+ }
+
# If we don't yet know the classification then stop the parser
if ( $class eq '' ) {
***************
*** 1726,1730 ****
$self->{parser__}->{color__} = 1;
$self->{parser__}->{bayes__} = bless $self;
! my $result = $self->{parser__}->parse_file( $file );
$self->{parser__}->{color__} = 0;
--- 1755,1763 ----
$self->{parser__}->{color__} = 1;
$self->{parser__}->{bayes__} = bless $self;
!
! # Pass language parameter to parse_file()
!
! my $result = $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
!
$self->{parser__}->{color__} = 0;
***************
*** 1861,1869 ****
}
foreach my $file (@files) {
! $self->{parser__}->parse_file( $file );
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
}
}
--- 1894,1915 ----
}
+ # Pass language parameter to parse_file()
+
foreach my $file (@files) {
! $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
! no locale;
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
! }
! } else {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->{parser__}->{words__}{$word} + $self->get_base_value_( $bucket, $word ) );
! }
}
}
***************
*** 1912,1919 ****
}
! $self->{parser__}->parse_file( $file );
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
}
--- 1958,1978 ----
}
! # Pass language parameter to parse_file()
! $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ) );
!
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
! no locale;
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
! }
! } else {
! foreach my $word (keys %{$self->{parser__}->{words__}}) {
! $self->set_value_( $bucket, $word, $self->get_base_value_( $bucket, $word ) - $self->{parser__}->{words__}{$word} );
! }
}
***************
*** 2067,2071 ****
my ( $self, $bucket, $type ) = @_;
! return sort keys %{$self->{magnets__}{$bucket}{$type}};
}
--- 2126,2139 ----
my ( $self, $bucket, $type ) = @_;
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
!
! if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) {
! no locale;
! return sort keys %{$self->{magnets__}{$bucket}{$type}};
! } else {
! return sort keys %{$self->{magnets__}{$bucket}{$type}};
! }
}
***************
*** 2204,2212 ****
return $self->{magnet_count__};
}
!
sub wmformat
{
my ( $self, $value ) = @_;
!
$self->{wmformat__} = $value if (defined $value);
return $self->{wmformat__};
--- 2272,2336 ----
return $self->{magnet_count__};
}
!
! # ---------------------------------------------------------------------------------------------
! #
! # parse_with_kakasi
! #
! # Parse Japanese mail message with Kakasi
! #
! # Japanese needs to be parsed by language processing filter, "Kakasi"
! # before it is passed to Bayes classifier because words are not splitted
! # by spaces.
! #
! # $file The file to parse
! #
! # ---------------------------------------------------------------------------------------------
! sub parse_with_kakasi
! {
! my ( $self, $file, $dcount, $mcount ) = @_;
!
! # This is used for Japanese support
! require Encode;
!
! # This is used to parse Japanese
! require Text::Kakasi;
!
! my $temp_file = $self->global_config_( 'msgdir' ) . "kakasi$dcount" . "=$mcount.msg";
! print $temp_file;
!
! # Split Japanese email body into words using Kakasi Wakachigaki
! # mode(-w is passed to Kakasi as argument). The most common charset of
! # Japanese email is ISO-2022-JP, alias is jis, so -ijis and -ojis
! # are passed to tell Kakasi the input charset and the output charset
! # explicitly.
! #
! # After Kakasi processing, Encode::from_to is used to convert into UTF-8.
! #
! # Japanese email charset is assumed to be ISO-2022-JP. Needs to expand for
! # other possible charset, such as Shift_JIS, EUC-JP, UTF-8.
!
! Text::Kakasi::getopt_argv("kakasi", "-w -ijis -ojis");
! open KAKASI_IN, "<$file";
! open KAKASI_OUT, ">$temp_file";
!
! while( <KAKASI_IN> ){
! my $kakasi_out;
!
! $kakasi_out = Text::Kakasi::do_kakasi($_);
! Encode::from_to($kakasi_out, "iso-2022-jp", "euc-jp");
! print KAKASI_OUT $kakasi_out;
! }
!
! close KAKASI_OUT;
! close KAKASI_IN;
! Text::Kakasi::close_kanwadict();
! unlink( $file );
! rename( $temp_file, $file );
! }
!
sub wmformat
{
my ( $self, $value ) = @_;
!
$self->{wmformat__} = $value if (defined $value);
return $self->{wmformat__};
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.161
retrieving revision 1.162
diff -C2 -d -r1.161 -r1.162
*** MailParse.pm 19 Sep 2003 20:18:52 -0000 1.161
--- MailParse.pm 22 Sep 2003 13:27:21 -0000 1.162
***************
*** 32,35 ****
--- 32,56 ----
use MIME::QuotedPrint;
+ # These are used for Japanese support
+
+ use Encode;
+ my $ascii = '[\x00-\x7F]'; # ASCII chars
+ my $two_bytes_euc_jp = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2bytes EUC-JP chars
+ my $three_bytes_euc_jp = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3bytes EUC-JP chars
+ my $euc_jp = "(?:$ascii|$two_bytes_euc_jp|$three_bytes_euc_jp)"; # EUC-JP chars
+
+ # Symbols in EUC-JP chars which cannot be considered a part of words
+ my $symbol_row1_euc_jp = '(?:[\xA1][\xA1-\xBB\xBD-\xFE])';
+ my $symbol_row2_euc_jp = '(?:[\xA2][\xA1-\xFE])';
+ my $symbol_row8_euc_jp = '(?:[\xA8][\xA1-\xFE])';
+ my $symbol_euc_jp = "(?:$symbol_row1_euc_jp|$symbol_row2_euc_jp|$symbol_row8_euc_jp)";
+
+ # Cho-on kigou(symbol in Japanese), a special symbol which can appear in middle of words
+ my $cho_on_symbol = '(?:\xA1\xBC)';
+
+ # Non-symbol EUC-JP chars
+ my $non_symbol_two_bytes_euc_jp = '(?:[\x8E\xA3-\xA7\xB0-\xFE][\xA1-\xFE])';
+ my $non_symbol_euc_jp = "(?:$non_symbol_two_bytes_euc_jp|$three_bytes_euc_jp|$cho_on_symbol)";
+
# HTML entity mapping to character codes, this maps things like & to their corresponding
# character code
***************
*** 173,176 ****
--- 194,201 ----
$self->{html_arg__} = '';
$self->{in_headers__} = 0;
+
+ # This is used for switching on/off language specific functionality
+ $self->{lang__} = '';
+
$self->{first20__} = '';
***************
*** 462,476 ****
}
! # Only care about words between 3 and 45 characters since short words like
! # an, or, if are too common and the longest word in English (according to
! # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
! while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
}
! update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
}
--- 487,529 ----
}
! if ( $self->{lang__} eq 'Nihongo' ) {
! # In Japanese mode, non-symbol EUC-JP characters should be
! # matched.
! #
! # ^$euc_jp*? is added to avoid incorrect matching.
! # For example, EUC-JP char represented by code A4C8, should not
! # match the middle of two EUC-JP chars represented by CCA4 and
! # C8BE, the second byte of the first char and the first byte of
! # the second char.
! while ( $line =~ s/^$euc_jp*?(([A-Za-z]|$non_symbol_euc_jp)([A-Za-z\']|$non_symbol_euc_jp){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//ox ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
! }
!
! my $matched_word = $1;
!
! # In Japanese, 2 characters words are common, so care about
! # words between 2 and 45 characters
!
! if (((length $matched_word >= 3) && ($matched_word =~ /[A-Za-z]/)) || ((length $matched_word >= 2) && ($matched_word =~ /$non_symbol_euc_jp/))) {
! update_word($self, $matched_word, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]'."|$symbol_euc_jp", $prefix);
! }
}
! } else {
! # Only care about words between 3 and 45 characters since short words like
! # an, or, if are too common and the longest word in English (according to
! # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
!
! while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
! if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
! $self->{first20count__} += 1;
! $self->{first20__} .= " $1";
! }
!
! update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
! }
}
***************
*** 1026,1034 ****
#
# $file The file to open and parse
#
# ---------------------------------------------------------------------------------------------
sub parse_file
{
! my ( $self, $file ) = @_;
$self->start_parse();
--- 1079,1091 ----
#
# $file The file to open and parse
+ # $lang Pass in the current interface language for language specific parsing
#
# ---------------------------------------------------------------------------------------------
sub parse_file
{
! # $lang is used for switching on/off language specific functionality
!
! my ( $self, $file, $lang ) = @_;
! $self->{lang__} = $lang;
$self->start_parse();
***************
*** 1385,1397 ****
my $decode_it = '';
! while ( $mystring =~ /=\?[\w-]+\?(B|Q)\?(.*?)\?=/ig ) {
! if ($1 eq "B" || $1 eq "b") {
! $decode_it = decode_base64( $2 );
$mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i;
} else {
! if ($1 eq "Q" || $1 eq "q") {
! $decode_it = $2;
$decode_it =~ s/\_/=20/g;
$decode_it = decode_qp( $decode_it );
$mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i;
}
--- 1442,1466 ----
my $decode_it = '';
! while ( $mystring =~ /=\?([\w-]+)\?(B|Q)\?(.*?)\?=/ig ) {
! if ($2 eq "B" || $2 eq "b") {
! $decode_it = decode_base64( $3 );
!
! # for Japanese header
! if (uc($1) eq "ISO-2022-JP") {
! Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
! }
!
$mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i;
} else {
! if ($2 eq "Q" || $2 eq "q") {
! $decode_it = $3;
$decode_it =~ s/\_/=20/g;
$decode_it = decode_qp( $decode_it );
+
+ # for Japanese header
+ if (uc($1) eq "ISO-2022-JP") {
+ Encode::from_to($decode_it, "iso-2022-jp", "euc-jp");
+ }
+
$mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i;
}
|