From: naoki i. <am...@us...> - 2005-09-17 05:40:38
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16088/Classifier Modified Files: Tag: b0_22_2 MailParse.pm Bayes.pm Log Message: Following Japanese support: - Support Base64-encoded Japanese mail. - Performance update for Kakasi wakachi-gaki. - Performance update for Mutex with Text-Kakasi. - Fix broken euc-jp code on UI (history tab). Index: Bayes.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v retrieving revision 1.327.4.2 retrieving revision 1.327.4.3 diff -C2 -d -r1.327.4.2 -r1.327.4.3 *** Bayes.pm 16 Sep 2005 19:20:26 -0000 1.327.4.2 --- Bayes.pm 17 Sep 2005 05:40:30 -0000 1.327.4.3 *************** *** 339,342 **** --- 339,361 ---- } + # Since Text::Kakasi is not thread-safe, we use it under the + # control of a Mutex to avoid a crash if we are running on + # Windows and using the fork. + + if ( ( $self->{parser__}->{lang__} eq 'Nihongo' ) && ( $^O eq 'MSWin32' ) && + ( ( ( $self->module_config_( 'pop3', 'enabled' ) ) && + ( $self->module_config_( 'pop3', 'force_fork' ) ) ) || + ( ( $self->module_config_( 'nntp', 'enabled' ) ) && + ( $self->module_config_( 'nntp', 'force_fork' ) ) ) || + ( ( $self->module_config_( 'smtp', 'enabled' ) ) && + ( $self->module_config_( 'smtp', 'force_fork' ) ) ) ) ) { + $self->{parser__}->{need_kakasi_mutex__} = 1; + + # Prepare the Mutex. + require POPFile::Mutex; + $self->{parser__}->{kakasi_mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); + $self->log_( 2, "Create mutex for Kakasi." ); + } + $self->upgrade_predatabase_data__(); Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.216 retrieving revision 1.216.4.1 diff -C2 -d -r1.216 -r1.216.4.1 *** MailParse.pm 8 Dec 2004 01:45:29 -0000 1.216 --- MailParse.pm 17 Sep 2005 05:40:28 -0000 1.216.4.1 *************** *** 1539,1542 **** --- 1539,1556 ---- $self->{colorized__} = ''; $self->{colorized__} .= "<tt>" if ( $self->{color__} ne '' ); + + # Since Text::Kakasi is not thread-safe, we use it under the + # control of a Mutex to avoid a crash if we are running on + # Windows. + + if ( $self->{lang__} eq 'Nihongo' ) { + if ( $self->{need_kakasi_mutex__} ) { + require POPFile::Mutex; + $self->{kakasi_mutex__}->acquire(); + } + + # Open Kakasi dictionary and initialize + init_kakasi(); + } } *************** *** 1574,1577 **** --- 1588,1601 ---- $self->{in_html_tag__} = 0; + + if ( $self->{lang__} eq 'Nihongo' ) { + # Close Kakasi dictionary + close_kakasi(); + + if ( $self->{need_kakasi_mutex__} ) { + require POPFile::Mutex; + $self->{kakasi_mutex__}->release(); + } + } } *************** *** 1703,1706 **** --- 1727,1733 ---- print "Hit MIME boundary --$1\n" if $self->{debug__}; + # Decode base64 for every part. + $self->{colorized__} .= $self->clear_out_base64() . "\n\n"; + $self->{in_headers__} = 1; } else { *************** *** 1783,1786 **** --- 1810,1819 ---- $decoded = decode_base64( $self->{base64__} ); + + if ( $self->{lang__} eq 'Nihongo' ) { + $decoded = convert_encoding( $decoded, $self->{charset__}, 'euc-jp', '7bit-jis', @{$encoding_candidates{$self->{lang__}}} ); + $decoded = parse_line_with_kakasi( $self, $decoded ); + } + $self->parse_html( $decoded, 1 ); *************** *** 2496,2534 **** my ( $self, $line ) = @_; # This is used to parse Japanese require Text::Kakasi; ! # Split Japanese line into words using Kakasi Wakachigaki ! # mode(-w is passed to Kakasi as argument). Both input and ouput ! # encoding are EUC-JP. ! # ! # Since Text::Kakasi is not thread-safe, we use it under the ! # control of a semaphore to avoid a crash if we are running on ! # Windows in a forked process. ! # ! # Note that this requires us to detect a sub-process by looking at ! # the value of $$. In ActivePerl a negative PID is in a ! # sub-process If this were to change then this code would not ! # work. ! my $need_semaphore = ( ( $^O eq 'MSWin32' ) && ( $$ < 0 ) ); ! if ( $need_semaphore ) { ! if ( !defined( $self->{mutex__} ) ) { ! require POPFile::Mutex; ! $self->{mutex__} = new POPFile::Mutex( 'mailparse_kakasi' ); ! } ! $self->{mutex__}->acquire(); ! } ! Text::Kakasi::getopt_argv("kakasi", "-w -ieuc -oeuc"); ! $line = Text::Kakasi::do_kakasi($line); ! Text::Kakasi::close_kanwadict(); ! if ( $need_semaphore ) { ! $self->{mutex__}->release(); ! } ! return $line; } --- 2529,2574 ---- my ( $self, $line ) = @_; + # If the line does not contain Japanese characters, do nothing + return $line if ( $line =~ /^[\x00-\x7F]*$/ ); + # This is used to parse Japanese require Text::Kakasi; ! # Split Japanese line into words using Kakasi Wakachigaki mode ! $line = Text::Kakasi::do_kakasi($line); ! return $line; ! } ! # ---------------------------------------------------------------------------- ! # ! # init_kakasi ! # ! # Open the kanwa dictionary and initialize the parameter of Kakasi. ! # ! # ---------------------------------------------------------------------------- ! sub init_kakasi ! { ! require Text::Kakasi; ! # Initialize Kakasi with Wakachigaki mode(-w is passed to ! # Kakasi as argument). Both input and ouput encoding are ! # EUC-JP. ! Text::Kakasi::getopt_argv("kakasi", "-w", "-ieuc", "-oeuc"); ! } ! # ---------------------------------------------------------------------------- ! # ! # close_kakasi ! # ! # Close the kanwa dictionary of Kakasi. ! # ! # ---------------------------------------------------------------------------- ! sub close_kakasi ! { ! require Text::Kakasi; ! ! Text::Kakasi::close_kanwadict(); } |