From: naoki i. <am...@us...> - 2007-11-27 14:44:58
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26968/Classifier Modified Files: Bayes.pm MailParse.pm Log Message: Japanese users now choose the Japanese parser by changing the 'bayes_nihongo_parser' option. Performance improvement for Japanese users. Decode the encoded attachment file names. Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.353 retrieving revision 1.354 diff -C2 -d -r1.353 -r1.354 *** Bayes.pm 26 Nov 2007 15:22:48 -0000 1.353 --- Bayes.pm 27 Nov 2007 14:44:56 -0000 1.354 *************** *** 230,233 **** --- 230,238 ---- $self->config_( 'localhostname', '' ); + # Japanese wakachigaki parser ('kakasi' or 'mecab' or 'internal'). + # TODO: Users can choose the parser engine to use? + + $self->config_( 'nihongo_parser', 'kakasi' ); + $self->mq_register_( 'COMIT', $self ); $self->mq_register_( 'RELSE', $self ); *************** *** 310,326 **** # Windows and using the fork. ! if ( ( $self->{parser__}->{lang__} eq 'Nihongo' ) && ( $^O eq 'MSWin32' ) && ! ( ( ( $self->user_module_config_( 1, 'pop3', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'pop3', 'force_fork' ) ) ) || ! ( ( $self->user_module_config_( 1, 'nntp', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'nntp', 'force_fork' ) ) ) || ! ( ( $self->user_module_config_( 1, 'smtp', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'smtp', 'force_fork' ) ) ) ) ) { ! $self->{parser__}->{need_kakasi_mutex__} = 1; ! # Prepare the Mutex. ! require POPFile::Mutex; ! $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); ! $self->log_( 2, "Create mutex for Kakasi." ); } --- 315,347 ---- # Windows and using the fork. ! if ( $self->{parser__}->{lang__} eq 'Nihongo' ) { ! # Setup Nihongo (Japanese) parser. ! my $nihongo_parser = $self->config_( 'nihongo_parser' ); ! ! 
$nihongo_parser = $self->{parser__}->setup_nihongo_parser( $nihongo_parser ); ! ! $self->log_( 2, "Use Nihongo (Japanese) parser : $nihongo_parser" ); ! $self->config_( 'nihongo_parser', $nihongo_parser ); ! ! # Since Text::Kakasi is not thread-safe, we use it under the ! # control of a Mutex to avoid a crash if we are running on ! # Windows and using the fork. ! ! if ( ( $nihongo_parser eq 'kakasi' ) && ( $^O eq 'MSWin32' ) && ! ( ( ( $self->user_module_config_( 1, 'pop3', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'pop3', 'force_fork' ) ) ) || ! ( ( $self->user_module_config_( 1, 'nntp', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'nntp', 'force_fork' ) ) ) || ! ( ( $self->user_module_config_( 1, 'smtp', 'enabled' ) ) && ! ( $self->user_module_config_( 1, 'smtp', 'force_fork' ) ) ) ) ) { ! ! $self->{parser__}->{need_kakasi_mutex__} = 1; ! ! # Prepare the Mutex. ! require POPFile::Mutex; ! $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); ! $self->log_( 2, "Create mutex for Kakasi." ); ! } } Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.229 retrieving revision 1.230 diff -C2 -d -r1.229 -r1.230 *** MailParse.pm 26 Nov 2007 15:22:48 -0000 1.229 --- MailParse.pm 27 Nov 2007 14:44:56 -0000 1.230 *************** *** 70,73 **** --- 70,92 ---- my $non_symbol_euc_jp = "(?:$non_symbol_two_bytes_euc_jp|$three_bytes_euc_jp|$cho_on_symbol)"; + # Constants for the internal wakachigaki parser. + # Kind of EUC-JP chars + my $euc_jp_symbol = '[\xA1\xA2\xA6-\xA8\xAD\xF9-\xFC][\xA1-\xFE]'; # The symbols make a word of one character. 
+ my $euc_jp_alphanum = '(?:\xA3[\xB0-\xB9\xC1-\xDA\xE1-\xFA])+'; # One or more alphabets and numbers + my $euc_jp_hiragana = '(?:(?:\xA4[\xA1-\xF3])+(?:\xA1[\xAB\xAC\xB5\xB6\xBC])*)+'; # One or more Hiragana characters + my $euc_jp_katakana = '(?:(?:\xA5[\xA1-\xF6])+(?:\xA1[\xA6\xBC\xB3\xB4])*)+'; # One or more Katakana characters + my $euc_jp_hkatakana = '(?:\x8E[\xA6-\xDF])+'; # One or more Half-width Katakana characters + my $euc_jp_kanji = '[\xB0-\xF4][\xA1-\xFE](?:[\xB0-\xF4][\xA1-\xFE]|\xA1\xB9)?'; # One or two Kanji characters + + my $euc_jp_word = '(' . + $euc_jp_alphanum . '|' . + $euc_jp_hiragana . '|' . + $euc_jp_katakana . '|' . + $euc_jp_hkatakana . '|' . + $euc_jp_kanji . '|' . + $euc_jp_symbol . '|' . + $ascii . '+|' . + $three_bytes_euc_jp . ')'; + # HTML entity mapping to character codes, this maps things like & # to their corresponding character code *************** *** 231,234 **** --- 250,256 ---- $self->{prev__} = ''; + # Object for the Nihongo (Japanese) parser. + $self->{nihongo_parser__} = undef; + return $result; } *************** *** 667,671 **** # first char and the first byte of the second char. ! while ( $line =~ s/^$euc_jp*?(([A-Za-z]|$non_symbol_euc_jp)([A-Za-z\']|$non_symbol_euc_jp){1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//ox ) { if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) { $self->{first20count__} += 1; --- 689,696 ---- # first char and the first byte of the second char. ! # In Japanese, one character words are common, so care about ! # words between 2 and 45 characters ! ! while ( $line =~ s/^$euc_jp*?([A-Za-z][A-Za-z\']{2,44}|$non_symbol_euc_jp{2,45})(?:[_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)//o ) { if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) { $self->{first20count__} += 1; *************** *** 673,684 **** } ! my $matched_word = $1; ! ! # In Japanese, 2 characters words are common, so ! # care about words between 2 and 45 characters ! ! 
if (((length $matched_word >= 3) && ($matched_word =~ /[A-Za-z]/)) || ((length $matched_word >= 2) && ($matched_word =~ /$non_symbol_euc_jp/))) { ! update_word($self, $matched_word, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]'."|$symbol_euc_jp", $prefix); ! } } } else { --- 698,702 ---- } ! update_word($self, $1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]|'.$symbol_euc_jp, $prefix); } } else { *************** *** 1583,1591 **** $self->{charset__} = ''; - # Since Text::Kakasi is not thread-safe, we use it under the - # control of a Mutex to avoid a crash if we are running on - # Windows. - if ( $self->{lang__} eq 'Nihongo' ) { if ( $self->{need_kakasi_mutex__} ) { require POPFile::Mutex; --- 1601,1609 ---- $self->{charset__} = ''; if ( $self->{lang__} eq 'Nihongo' ) { + + # Since Text::Kakasi is not thread-safe, we use it under the + # control of a Mutex to avoid a crash if we are running on + # Windows. if ( $self->{need_kakasi_mutex__} ) { require POPFile::Mutex; *************** *** 1593,1598 **** } ! # Open Kakasi dictionary and initialize ! init_kakasi(); } } --- 1611,1616 ---- } ! # Initialize Nihongo (Japanese) parser ! $self->{nihongo_parser__}{init}( $self ); } } *************** *** 1636,1641 **** if ( $self->{lang__} eq 'Nihongo' ) { ! # Close Kakasi dictionary ! close_kakasi(); if ( $self->{need_kakasi_mutex__} ) { --- 1654,1659 ---- if ( $self->{lang__} eq 'Nihongo' ) { ! # Close Nihongo (Japanese) parser ! $self->{nihongo_parser__}{close}( $self ); if ( $self->{need_kakasi_mutex__} ) { *************** *** 1676,1682 **** if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) { if ( $self->{lang__} eq 'Nihongo') { ! if ( $line =~ /=\r\n$/ ) { # Encoded in multiple lines - $line =~ s/=\r\n$//g; $self->{prev__} .= $line; next; --- 1694,1699 ---- if ( !$self->{in_headers__} && $self->{encoding__} =~ /quoted\-printable/i) { if ( $self->{lang__} eq 'Nihongo') { ! 
if ( $line =~ s/=\r\n$// ) { # Encoded in multiple lines $self->{prev__} .= $line; next; *************** *** 1690,1701 **** } ! if ( $self->{lang__} eq 'Nihongo' ) { # Decode \x?? ! if ( !$self->{in_headers__} ) { ! $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig; ! } $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $line = parse_line_with_kakasi( $self, $line ); } --- 1707,1716 ---- } ! if ( $self->{lang__} eq 'Nihongo' && !$self->{in_headers__} && $self->{encoding__} !~ /base64/i ) { # Decode \x?? ! $line =~ s/\\x([8-9A-F][A-F0-9])/pack("C", hex($1))/eig; $line = convert_encoding( $line, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $line = $self->{nihongo_parser__}{parse}( $self, $line ); } *************** *** 1869,1873 **** if ( $self->{lang__} eq 'Nihongo' ) { $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $decoded = parse_line_with_kakasi( $self, $decoded ); } --- 1884,1888 ---- if ( $self->{lang__} eq 'Nihongo' ) { $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); ! $decoded = $self->{nihongo_parser__}{parse}( $self, $decoded ); } *************** *** 2093,2099 **** if ( $self->{subject__} eq '' ) { ! # In Japanese mode, parse subject with kakasi ! $argument = parse_line_with_kakasi( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' ); $self->{subject__} = $argument; --- 2108,2114 ---- if ( $self->{subject__} eq '' ) { ! # In Japanese mode, parse subject with Nihongo (Japanese) parser ! 
$argument = $self->{nihongo_parser__}{parse}( $self, $argument ) if ( $self->{lang__} eq 'Nihongo' && $argument ne '' ); $self->{subject__} = $argument; *************** *** 2457,2460 **** --- 2472,2478 ---- print "Add filename $filename\n" if $self->{debug__}; + # Decode the filename + $filename = $self->decode_string( $filename ); + my ( $name, $ext ) = $self->file_extension( $filename ); *************** *** 2484,2488 **** my ( $attachment, $filename ) = $self->match_attachment_filename( $params ); ! if ( $attachment eq 'attachment' ) { $self->add_attachment_filename( $filename ) ; } --- 2502,2506 ---- my ( $attachment, $filename ) = $self->match_attachment_filename( $params ); ! if ( defined( $attachment ) && ( $attachment eq 'attachment' ) ) { $self->add_attachment_filename( $filename ) ; } *************** *** 2562,2566 **** if(ref $enc){ ! $from= $enc->name; } else { --- 2580,2584 ---- if(ref $enc){ ! $from = $enc->name; } else { *************** *** 2580,2584 **** # Workaround for Encode::Unicode error bug. eval { ! Encode::from_to($string, $from, $to); }; $string = $orig_string if ($@); --- 2598,2606 ---- # Workaround for Encode::Unicode error bug. eval { ! if (ref $enc) { ! $string = Encode::encode($to, $enc->decode($string)); ! } else { ! Encode::from_to($string, $from, $to); ! } }; $string = $orig_string if ($@); *************** *** 2607,2613 **** return $line if ( $line =~ /^[\x00-\x7F]*$/ ); - # This is used to parse Japanese - require Text::Kakasi; - # Split Japanese line into words using Kakasi Wakachigaki mode $line = Text::Kakasi::do_kakasi($line); --- 2629,2632 ---- *************** *** 2618,2621 **** --- 2637,2692 ---- # ---------------------------------------------------------------------------- # + # parse_line_with_mecab + # + # Parse a line with MeCab + # + # Split Japanese words by spaces using "MeCab" - Yet Another Part-of-Speech + # and Morphological Analyzer. 
+ # + # $line The line to be parsed + # + # ---------------------------------------------------------------------------- + sub parse_line_with_mecab + { + my ( $self, $line ) = @_; + + # If the line does not contain Japanese characters, do nothing + return $line if ( $line =~ /^[\x00-\x7F]*$/ ); + + # Split Japanese line into words using MeCab + $line = $self->{nihongo_parser__}{obj_mecab}->parse($line); + + # Remove the unnecessary white spaces + $line =~ s/([\x00-\x1f\x21-\x7f]) (?=[\x00-\x1f\x21-\x7f])/$1/g; + + return $line; + } + + # ---------------------------------------------------------------------------- + # + # parse_line_with_internal_parser + # + # Parse a line with an internal perser + # + # Split characters by kind of the character + # + # $line The line to be parsed + # + # ---------------------------------------------------------------------------- + sub parse_line_with_internal_parser + { + my ( $self, $line ) = @_; + + # If the line does not contain Japanese characters, do nothing + return $line if ( $line =~ /^[\x00-\x7F]*$/ ); + + # Split Japanese line into words by the kind of characters + $line =~ s/\G$euc_jp_word/$1 /og; + + return $line; + } + + # ---------------------------------------------------------------------------- + # # init_kakasi # *************** *** 2625,2635 **** sub init_kakasi { - require Text::Kakasi; - # Initialize Kakasi with Wakachigaki mode(-w is passed to # Kakasi as argument). Both input and ouput encoding are # EUC-JP. ! Text::Kakasi::getopt_argv("kakasi", "-w", "-ieuc", "-oeuc"); } --- 2696,2722 ---- sub init_kakasi { # Initialize Kakasi with Wakachigaki mode(-w is passed to # Kakasi as argument). Both input and ouput encoding are # EUC-JP. ! Text::Kakasi::getopt_argv('kakasi', '-w', '-ieuc', '-oeuc'); ! } ! ! # ---------------------------------------------------------------------------- ! # ! # init_mecab ! # ! # Create a new parser object of MeCab. ! # ! 
# ---------------------------------------------------------------------------- ! sub init_mecab ! { ! my ( $self ) = @_; ! ! # Initialize MeCab (-F %M\s -U %M\s -E \n is passed to MeCab as argument). ! # Insert white spaces after words. ! ! $self->{nihongo_parser__}{obj_mecab} ! = MeCab::Tagger->new('-F %M\s -U %M\s -E \n'); } *************** *** 2643,2651 **** sub close_kakasi { - require Text::Kakasi; - Text::Kakasi::close_kanwadict(); } 1; --- 2730,2823 ---- sub close_kakasi { Text::Kakasi::close_kanwadict(); } + # ---------------------------------------------------------------------------- + # + # close_mecab + # + # Free the parser object of MeCab. + # + # ---------------------------------------------------------------------------- + sub close_mecab + { + my ( $self ) = @_; + + $self->{nihongo_parser__}{obj_mecab} = undef; + } + + # ---------------------------------------------------------------------------- + # + # setup_nihongo_parser + # + # Check whether Nihongo (Japanese) parsers are available and setup subroutines. + # + # $nihongo_parser Nihongo (Japanese) parser to use + # ( kakasi / mecab / internal ) + # + # ---------------------------------------------------------------------------- + sub setup_nihongo_parser + { + my ( $self, $nihongo_parser ) = @_; + + # If MeCab is installed, use MeCab. + if ( $nihongo_parser eq 'mecab' ) { + my $has_mecab = 0; + + foreach my $prefix (@INC) { + my $realfilename = "$prefix/MeCab.pm"; + if (-f $realfilename) { + $has_mecab = 1; + last; + } + } + + # If MeCab is not installed, try to use Text::Kakasi. + $nihongo_parser = 'kakasi' unless ( $has_mecab ); + } + + # If Text::Kakasi is installed, use Text::Kakasi. + if ( $nihongo_parser eq 'kakasi' ) { + my $has_kakasi = 0; + + foreach my $prefix (@INC) { + my $realfilename = "$prefix/Text/Kakasi.pm"; + if (-f $realfilename) { + $has_kakasi = 1; + last; + } + } + + # If Kakasi is not installed, use the internal parser. 
+ $nihongo_parser = 'internal' unless ( $has_kakasi ); + } + + # Setup perser's subroutines + if ( $nihongo_parser eq 'mecab' ) { + # Import MeCab module + require MeCab; + import MeCab; + + $self->{nihongo_parser__}{init} = \&init_mecab; + $self->{nihongo_parser__}{parse} = \&parse_line_with_mecab; + $self->{nihongo_parser__}{close} = \&close_mecab; + } elsif ( $nihongo_parser eq 'kakasi' ) { + # Import Text::Kakasi module + require Text::Kakasi; + import Text::Kakasi; + + $self->{nihongo_parser__}{init} = \&init_kakasi; + $self->{nihongo_parser__}{parse} = \&parse_line_with_kakasi; + $self->{nihongo_parser__}{close} = \&close_kakasi; + } else { + # Require no external modules + $self->{nihongo_parser__}{init} = sub {}; # Needs no initialization + $self->{nihongo_parser__}{parse} = \&parse_line_with_internal_parser; + $self->{nihongo_parser__}{close} = sub {}; + } + + return $nihongo_parser; + } + 1; |