|
From: <jgr...@us...> - 2003-10-28 20:11:16
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv17979/Classifier
Modified Files:
Bayes.pm
Log Message:
Fix bug 826765
@ and $ inside magnets were not being handled properly.
Classifier/Bayes.pm:
Factor most of magnet_match__ into magnet_match_helper__ so
that there is no duplicated code. Remove use of regexps for
magnet match and replace with simple 'eq' matching, thus
eliminating all the complexities around special characters
in regexps and the fact that @ and $ are illegal in \Q \E
quoted regexps.
tests/TestBayes.tst:
Added tests for magnet_match__ with specific emphasis on
handling of $ and @.
Made Japanese tests detect whether Text::Kakasi is present
on the machine and ignore them (with a warning) if it is
not present.
Index: Bayes.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/Bayes.pm,v
retrieving revision 1.216
retrieving revision 1.217
diff -C2 -d -r1.216 -r1.217
*** Bayes.pm 28 Oct 2003 17:28:02 -0000 1.216
--- Bayes.pm 28 Oct 2003 19:39:48 -0000 1.217
***************
*** 867,876 ****
}
-
# ---------------------------------------------------------------------------------------------
#
! # magnet_match__
#
! # Helper the determines if a specific string matches a certain magnet type in a bucket
#
# $match The string to match
--- 867,876 ----
}
# ---------------------------------------------------------------------------------------------
#
! # magnet_match_helper__
#
! # Helper the determines if a specific string matches a certain magnet type in a bucket, used
! # by magnet_match_
#
# $match The string to match
***************
*** 880,897 ****
# ---------------------------------------------------------------------------------------------
! sub magnet_match__
{
! my ( $self, $noattype, $bucket, $type ) = @_;
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
! no locale;
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
! if ( $noattype =~ m/\Q$regex\E/i ) {
$self->{scores__} = '';
$self->{magnet_used__} = 1;
--- 880,894 ----
# ---------------------------------------------------------------------------------------------
! sub magnet_match_helper__
{
! my ( $self, $match, $bucket, $type ) = @_;
! $match = lc($match);
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! $magnet = lc($magnet);
! for my $i (0..(length($match)-length($magnet))) {
! if ( substr( $match, $i, length($magnet)) eq $magnet ) {
$self->{scores__} = '';
$self->{magnet_used__} = 1;
***************
*** 899,923 ****
return 1;
! }
}
! }else{
! for my $magnet (sort keys %{$self->{magnets__}{$bucket}{$type}}) {
! my $regex;
! $regex = $magnet;
! $regex =~ s/@/__POPFILE_AT__/g;
! $regex =~ s/\$/__POPFILE_DOLLAR__/g;
! if ( $noattype =~ m/\Q$regex\E/i ) {
! $self->{scores__} = '';
! $self->{magnet_used__} = 1;
! $self->{magnet_detail__} = "$type: $magnet";
! return 1;
! }
! }
! }
! return 0;
}
--- 896,928 ----
return 1;
! }
}
! }
! return 0;
! }
! # ---------------------------------------------------------------------------------------------
! #
! # magnet_match__
! #
! # Helper the determines if a specific string matches a certain magnet type in a bucket
! #
! # $match The string to match
! # $bucket The bucket to check
! # $type The magnet type to check
! #
! # ---------------------------------------------------------------------------------------------
! sub magnet_match__
! {
! my ( $self, $match, $bucket, $type ) = @_;
! if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) {
! no locale;
! return $self->magnet_match_helper__( $match, $bucket, $type );
! } else {
! return $self->magnet_match_helper__( $match, $bucket, $type );
! }
}
***************
*** 955,977 ****
for my $bucket (sort keys %{$self->{magnets__}}) {
for my $type (sort keys %{$self->{magnets__}{$bucket}}) {
!
! # You cannot use @ or $ inside a \Q\E regular expression and hence
! # we have to change the $magnet and the text we are comparing against
! # by changing the $ and @ signs to special forms which I hope
! # never really appear
!
! my $noattype;
!
! $noattype = $self->{parser__}->get_header($type);
! $noattype =~ s/@/__POPFILE_AT__/g;
! $noattype =~ s/\$/__POPFILE_DOLLAR__/g;
!
! # In Japanese mode, disable locale.
! # Sorting Japanese with "use locale" is memory and time consuming,
! # and may cause perl crash.
!
! # Disable the locale in Korean mode, too.
!
! return $bucket if ( $self->magnet_match__( $noattype, $bucket, $type ) );
}
}
--- 960,966 ----
for my $bucket (sort keys %{$self->{magnets__}}) {
for my $type (sort keys %{$self->{magnets__}{$bucket}}) {
! if ( $self->magnet_match__( $self->{parser__}->get_header($type), $bucket, $type ) ) {
! return $bucket;
! }
}
}
|