From: naoki i. <am...@us...> - 2007-09-06 16:31:40
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15410/Classifier Modified Files: Tag: b0_22_2 MailParse.pm Bayes.pm Log Message: Japanese users now choose the Japanese parser by changing the 'bayes_nihongo_parser' option. Performance improvement for Japanese users. Decode the encoded attachment file names. Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.327.4.8 retrieving revision 1.327.4.9 diff -C2 -d -r1.327.4.8 -r1.327.4.9 *** Bayes.pm 31 Aug 2007 14:13:48 -0000 1.327.4.8 --- Bayes.pm 6 Sep 2007 16:31:41 -0000 1.327.4.9 *************** *** 274,277 **** --- 274,280 ---- $self->config_( 'sqlite_tweaks', 0xFFFFFFFF ); + # Japanese wakachigaki parser ('kakasi' or 'mecab' or 'internal'). + $self->config_( 'nihongo_parser', 'kakasi' ); + $self->mq_register_( 'COMIT', $self ); $self->mq_register_( 'RELSE', $self ); *************** *** 345,365 **** } ! # Since Text::Kakasi is not thread-safe, we use it under the ! # control of a Mutex to avoid a crash if we are running on ! # Windows and using the fork. ! if ( ( $self->{parser__}->{lang__} eq 'Nihongo' ) && ( $^O eq 'MSWin32' ) && ! ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) && ! ( $self->module_config_( 'pop3', 'force_fork' ) ) ) || ! ( ( $self->module_config_( 'nntp', 'enabled' ) ) && ! ( $self->module_config_( 'nntp', 'force_fork' ) ) ) || ! ( ( $self->module_config_( 'smtp', 'enabled' ) ) && ! ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) { ! $self->{parser__}->{need_kakasi_mutex__} = 1; ! # Prepare the Mutex. ! require POPFile::Mutex; ! $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); ! $self->log_( 2, "Create mutex for Kakasi." ); } --- 348,379 ---- } ! if ( $self->{parser__}->{lang__} eq 'Nihongo' ) { ! # Setup Nihongo (Japanese) parser. ! my $nihongo_parser = $self->config_( 'nihongo_parser' ); ! 
$nihongo_parser = $self->{parser__}->setup_nihongo_parser( $nihongo_parser ); ! ! $self->log_( 2, "Use Nihongo (Japanese) parser : $nihongo_parser" ); ! $self->config_( 'nihongo_parser', $nihongo_parser ); ! ! # Since Text::Kakasi is not thread-safe, we use it under the ! # control of a Mutex to avoid a crash if we are running on ! # Windows and using the fork. ! ! if ( ( $nihongo_parser eq 'kakasi' ) && ( $^O eq 'MSWin32' ) && ! ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) && ! ( $self->module_config_( 'pop3', 'force_fork' ) ) ) || ! ( ( $self->module_config_( 'nntp', 'enabled' ) ) && ! ( $self->module_config_( 'nntp', 'force_fork' ) ) ) || ! ( ( $self->module_config_( 'smtp', 'enabled' ) ) && ! ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) { ! $self->{parser__}->{need_kakasi_mutex__} = 1; ! ! # Prepare the Mutex. ! require POPFile::Mutex; ! $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); ! $self->log_( 2, "Create mutex for Kakasi." ); ! } } Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.216.4.9 retrieving revision 1.216.4.10 diff -C2 -d -r1.216.4.9 -r1.216.4.10 *** MailParse.pm 6 Sep 2007 16:18:58 -0000 1.216.4.9 --- MailParse.pm 6 Sep 2007 16:31:41 -0000 1.216.4.10 *************** *** 44,48 **** my %encoding_candidates = ( ! 'Nihongo' => [ 'shiftjis', 'euc-jp', '7bit-jis' ] ); --- 44,48 ---- my %encoding_candidates = ( ! 'Nihongo' => [ 'cp932', 'euc-jp', '7bit-jis' ] ); *************** *** 65,68 **** --- 65,87 ---- my $non_symbol_euc_jp = "(?:$non_symbol_two_bytes_euc_jp|$three_bytes_euc_jp|$cho_on_symbol)"; + # Constants for the internal wakachigaki parser. + # Kind of EUC-JP chars + my $euc_jp_symbol = '[\xA1\xA2\xA6-\xA8\xAD\xF9-\xFC][\xA1-\xFE]'; # The symbols make a word of one character. 
+ my $euc_jp_alphanum = '(?:\xA3[\xB0-\xB9\xC1-\xDA\xE1-\xFA])+'; # One or more alphabets and numbers + my $euc_jp_hiragana = '(?:(?:\xA4[\xA1-\xF3])+(?:\xA1[\xAB\xAC\xB5\xB6\xBC])*)+'; # One or more Hiragana characters + my $euc_jp_katakana = '(?:(?:\xA5[\xA1-\xF6])+(?:\xA1[\xA6\xBC\xB3\xB4])*)+'; # One or more Katakana characters + my $euc_jp_hkatakana = '(?:\x8E[\xA6-\xDF])+'; # One or more Half-width Katakana characters + my $euc_jp_kanji = '[\xB0-\xF4][\xA1-\xFE](?:[\xB0-\xF4][\xA1-\xFE]|\xA1\xB9)?'; # One or two Kanji characters + + my $euc_jp_word = '(' . + $euc_jp_alphanum . '|' . + $euc_jp_hiragana . '|' . + $euc_jp_katakana . '|' . + $euc_jp_hkatakana . '|' . + $euc_jp_kanji . '|' . + $euc_jp_symbol . '|' . + $ascii . '+|' . + $three_bytes_euc_jp . ')'; + # HTML entity mapping to character codes, this maps things like & to their corresponding # character code *************** *** 221,227 **** $self->{first20__} = ''; ! # For support Quoted Printable in Japanese text, save encoded text in multiple lines $self->{prev__} = ''; return $result; } --- 240,250 ---- $self->{first20__} = ''; ! # For support Quoted Printable in Japanese text, save encoded text in ! # multiple lines $self->{prev__} = ''; + # Object for the Nihongo (Japanese) parser. + $self->{nihongo_parser__} = undef; + return $result; } *************** *** 648,652 **** # the second char. ! while ( $line =~ s/^$euc_jp*?(([A-Za-z]|$non_symbol_euc_jp)([A-Za-z\']|$non_symbol_euc_jp){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//ox ) { if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) { $self->{first20count__} += 1; --- 671,678 ---- # the second char. ! # In Japanese, one character words are common, so care about ! # words between 2 and 45 characters ! ! 
while ( $line =~ s/^$euc_jp*?([A-Za-z][A-Za-z\']{2,44}|$non_symbol_euc_jp{2,45})(?:[_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//o ) { if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) { $self->{first20count__} += 1; *************** *** 654,665 **** } ! my $matched_word = $1; ! ! # In Japanese, 2 characters words are common, so care about ! # words between 2 and 45 characters ! ! if (((length $matched_word >= 3) && ($matched_word =~ /[A-Za-z]/)) || ((length $matched_word >= 2) && ($matched_word =~ /$non_symbol_euc_jp/))) { ! update_word($self, $matched_word, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]'."|$symbol_euc_jp", $prefix); ! } } } else { --- 680,684 ---- } ! update_word($self, $1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]|'.$symbol_euc_jp, $prefix); } } else { *************** *** 1540,1548 **** $self->{charset__} = ''; - # Since Text::Kakasi is not thread-safe, we use it under the - # control of a Mutex to avoid a crash if we are running on - # Windows. - if ( $self->{lang__} eq 'Nihongo' ) { if ( $self->{need_kakasi_mutex__} ) { require POPFile::Mutex; --- 1559,1567 ---- $self->{charset__} = ''; if ( $self->{lang__} eq 'Nihongo' ) { + + # Since Text::Kakasi is not thread-safe, we use it under the + # control of a Mutex to avoid a crash if we are running on + # Windows. if ( $self->{need_kakasi_mutex__} ) { require POPFile::Mutex; *************** *** 1550,1555 **** } ! # Open Kakasi dictionary and initialize ! init_kakasi(); } } --- 1569,1574 ---- } ! # Initialize Nihongo (Japanese) parser ! $self->{nihongo_parser__}{init}( $self ); } } *************** *** 1590,1595 **** if ( $self->{lang__} eq 'Nihongo' ) { ! # Close Kakasi dictionary ! close_kakasi(); if ( $self->{need_kakasi_mutex__} ) { --- 1609,1614 ---- if ( $self->{lang__} eq 'Nihongo' ) { ! # Close Nihongo (Japanese) parser ! 
$self->{nihongo_parser__}{close}( $self ); if ( $self->{need_kakasi_mutex__} ) { *************** *** 1628,1634 **** if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) { if ( $self->{lang__} eq 'Nihongo') { ! if ( $line =~ /=\r\n$/ ) { # Encoded in multiple lines - $line =~ s/=\r\n$//g; $self->{prev__} .= $line; next; --- 1647,1652 ---- if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) { if ( $self->{lang__} eq 'Nihongo') { ! if ( $line =~ s/=\r\n$// ) { # Encoded in multiple lines $self->{prev__} .= $line; next; *************** *** 1642,1653 **** } ! if ( $self->{lang__} eq 'Nihongo' ) { # Decode \x?? ! if ( !$self->{in_headers__} ) { ! $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig; ! } $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $line = parse_line_with_kakasi( $self, $line ); } --- 1660,1669 ---- } ! if ( $self->{lang__} eq 'Nihongo' && !$self->{in_headers__} && $self->{encoding__} !~ /base64/i ) { # Decode \x?? ! $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig; $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $line = $self->{nihongo_parser__}{parse}( $self, $line ); } *************** *** 1814,1818 **** if ( $self->{lang__} eq 'Nihongo' ) { $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $decoded = parse_line_with_kakasi( $self, $decoded ); } --- 1830,1834 ---- if ( $self->{lang__} eq 'Nihongo' ) { $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $decoded = $self->{nihongo_parser__}{parse}( $self, $decoded ); } *************** *** 2033,2039 **** if ( $self->{subject__} eq '' ) { ! # In Japanese mode, parse subject with kakasi ! ! 
$argument = parse_line_with_kakasi( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' ); $self->{subject__} = $argument; --- 2049,2054 ---- if ( $self->{subject__} eq '' ) { ! # In Japanese mode, parse subject with Nihongo (Japanese) parser ! $argument = $self->{nihongo_parser__}{parse}( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' ); $self->{subject__} = $argument; *************** *** 2383,2386 **** --- 2398,2404 ---- print "Add filename $filename\n" if $self->{debug__}; + # Decode the filename + $filename = $self->decode_string( $filename ); + my ( $name, $ext ) = $self->file_extension( $filename ); *************** *** 2492,2496 **** if(ref $enc){ ! $from= $enc->name; } else { --- 2510,2514 ---- if(ref $enc){ ! $from= $enc->name; } else { *************** *** 2510,2514 **** # Workaround for Encode::Unicode error bug. eval { ! Encode::from_to($string, $from, $to); }; $string = $orig_string if ($@); --- 2528,2536 ---- # Workaround for Encode::Unicode error bug. eval { ! if (ref $enc) { ! $string = Encode::encode($to, $enc->decode($string)); ! } else { ! Encode::from_to($string, $from, $to); ! } }; $string = $orig_string if ($@); *************** *** 2537,2543 **** return $line if ( $line =~ /^[\x00-\x7F]*$/ ); - # This is used to parse Japanese - require Text::Kakasi; - # Split Japanese line into words using Kakasi Wakachigaki mode $line = Text::Kakasi::do_kakasi($line); --- 2559,2562 ---- *************** *** 2548,2551 **** --- 2567,2622 ---- # ---------------------------------------------------------------------------- # + # parse_line_with_mecab + # + # Parse a line with MeCab + # + # Split Japanese words by spaces using "MeCab" - Yet Another Part-of-Speech + # and Morphological Analyzer. 
+ # + # $line The line to be parsed + # + # ---------------------------------------------------------------------------- + sub parse_line_with_mecab + { + my ( $self, $line ) = @_; + + # If the line does not contain Japanese characters, do nothing + return $line if ( $line =~ /^[\x00-\x7F]*$/ ); + + # Split Japanese line into words using MeCab + $line = $self->{nihongo_parser__}{obj_mecab}->parse($line); + + # Remove the unnecessary white spaces + $line =~ s/([\x00-\x1f\x21-\x7f]) (?=[\x00-\x1f\x21-\x7f])/$1/g; + + return $line; + } + + # ---------------------------------------------------------------------------- + # + # parse_line_with_internal_parser + # + # Parse a line with an internal perser + # + # Split characters by kind of the character + # + # $line The line to be parsed + # + # ---------------------------------------------------------------------------- + sub parse_line_with_internal_parser + { + my ( $self, $line ) = @_; + + # If the line does not contain Japanese characters, do nothing + return $line if ( $line =~ /^[\x00-\x7F]*$/ ); + + # Split Japanese line into words by the kind of characters + $line =~ s/\G$euc_jp_word/$1 /og; + + return $line; + } + + # ---------------------------------------------------------------------------- + # # init_kakasi # *************** *** 2555,2565 **** sub init_kakasi { - require Text::Kakasi; - # Initialize Kakasi with Wakachigaki mode(-w is passed to # Kakasi as argument). Both input and ouput encoding are # EUC-JP. ! Text::Kakasi::getopt_argv("kakasi", "-w", "-ieuc", "-oeuc"); } --- 2626,2652 ---- sub init_kakasi { # Initialize Kakasi with Wakachigaki mode(-w is passed to # Kakasi as argument). Both input and ouput encoding are # EUC-JP. ! Text::Kakasi::getopt_argv('kakasi', '-w', '-ieuc', '-oeuc'); ! } ! ! # ---------------------------------------------------------------------------- ! # ! # init_mecab ! # ! # Create a new parser object of MeCab. ! # ! 
# ---------------------------------------------------------------------------- ! sub init_mecab ! { ! my ( $self ) = @_; ! ! # Initialize MeCab (-F %M\s -U %M\s -E \n is passed to MeCab as argument). ! # Insert white spaces after words. ! ! $self->{nihongo_parser__}{obj_mecab} ! = MeCab::Tagger->new('-F %M\s -U %M\s -E \n'); } *************** *** 2573,2581 **** sub close_kakasi { - require Text::Kakasi; - Text::Kakasi::close_kanwadict(); } 1; --- 2660,2753 ---- sub close_kakasi { Text::Kakasi::close_kanwadict(); } + # ---------------------------------------------------------------------------- + # + # close_mecab + # + # Free the parser object of MeCab. + # + # ---------------------------------------------------------------------------- + sub close_mecab + { + my ( $self ) = @_; + + $self->{nihongo_parser__}{obj_mecab} = undef; + } + + # ---------------------------------------------------------------------------- + # + # setup_nihongo_parser + # + # Check whether Nihongo (Japanese) parsers are available and setup subroutines. + # + # $nihongo_parser Nihongo (Japanese) parser to use + # ( kakasi / mecab / internal ) + # + # ---------------------------------------------------------------------------- + sub setup_nihongo_parser + { + my ( $self, $nihongo_parser ) = @_; + + # If MeCab is installed, use MeCab. + if ( $nihongo_parser eq 'mecab' ) { + my $has_mecab = 0; + + foreach my $prefix (@INC) { + my $realfilename = "$prefix/MeCab.pm"; + if (-f $realfilename) { + $has_mecab = 1; + last; + } + } + + # If MeCab is not installed, try to use Text::Kakasi. + $nihongo_parser = 'kakasi' unless ( $has_mecab ); + } + + # If Text::Kakasi is installed, use Text::Kakasi. + if ( $nihongo_parser eq 'kakasi' ) { + my $has_kakasi = 0; + + foreach my $prefix (@INC) { + my $realfilename = "$prefix/Text/Kakasi.pm"; + if (-f $realfilename) { + $has_kakasi = 1; + last; + } + } + + # If Kakasi is not installed, use the internal parser. 
+ $nihongo_parser = 'internal' unless ( $has_kakasi ); + } + + # Setup perser's subroutines + if ( $nihongo_parser eq 'mecab' ) { + # Import MeCab module + require MeCab; + import MeCab; + + $self->{nihongo_parser__}{init} = \&init_mecab; + $self->{nihongo_parser__}{parse} = \&parse_line_with_mecab; + $self->{nihongo_parser__}{close} = \&close_mecab; + } elsif ( $nihongo_parser eq 'kakasi' ) { + # Import Text::Kakasi module + require Text::Kakasi; + import Text::Kakasi; + + $self->{nihongo_parser__}{init} = \&init_kakasi; + $self->{nihongo_parser__}{parse} = \&parse_line_with_kakasi; + $self->{nihongo_parser__}{close} = \&close_kakasi; + } else { + # Require no external modules + $self->{nihongo_parser__}{init} = sub {}; # Needs no initialization + $self->{nihongo_parser__}{parse} = \&parse_line_with_internal_parser; + $self->{nihongo_parser__}{close} = sub {}; + } + + return $nihongo_parser; + } + 1; |