Update of /cvsroot/openinteract/OpenInteract/pkg/full_text/OpenInteract
In directory usw-pr-cvs1:/tmp/cvs-serv5126/OpenInteract
Modified Files:
FullText.pm
Log Message:
updated OI/FullText.pm to handle scalar refs and filehandles;
indexable text is passed back as a scalar ref now, and you can
override the fields to index by passing in 'fulltext_field' to the
save() call
Index: FullText.pm
===================================================================
RCS file: /cvsroot/openinteract/OpenInteract/pkg/full_text/OpenInteract/FullText.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** FullText.pm 2001/10/01 17:13:51 1.9
--- FullText.pm 2001/10/29 13:07:32 1.10
***************
*** 145,149 ****
# to the subclass and send it on up the line
! sub ruleset_add {
my ( $class, $rs_table ) = @_;
my $obj_class = ref $class || $class;
--- 145,149 ----
# to the subclass and send it on up the line
! sub ruleset_factory {
my ( $class, $rs_table ) = @_;
my $obj_class = ref $class || $class;
***************
*** 163,168 ****
my $R = OpenInteract::Request->instance;
$R->DEBUG && $R->scrib( 1, "Trying to index ", ref $self, " (", $self->id, ")" );
! my $indexable = $self->_indexable_object_text;
! $R->DEBUG && $R->scrib( 2, "Indexable text: ", $indexable );
my $wc = $self->_tokenize( $indexable );
$R->DEBUG && $R->scrib( 2, "Found the following tokens:", Dumper( $wc ) );
--- 163,168 ----
my $R = OpenInteract::Request->instance;
$R->DEBUG && $R->scrib( 1, "Trying to index ", ref $self, " (", $self->id, ")" );
! my $indexable = $self->_indexable_object_text( $p );
! $R->DEBUG && $R->scrib( 2, "Indexable text: ", $$indexable );
my $wc = $self->_tokenize( $indexable );
$R->DEBUG && $R->scrib( 2, "Found the following tokens:", Dumper( $wc ) );
***************
*** 201,215 ****
# Get the fields that should be indexed and join the values together
# with a space (easy), since we're just going to index all the text as
! # one big field
sub _indexable_object_text {
! my ( $self ) = @_;
my $R = OpenInteract::Request->instance;
! my $field_list = $self->CONFIG->{fulltext_field};
unless ( ref $field_list eq 'ARRAY' ) {
$R->scrib( 0, "Cannot index object text -- no fields presented in config file." );
return undef;
}
! return join ' ', map { $self->{$_} } @{ $field_list };
}
--- 201,231 ----
# Get the fields that should be indexed and join the values together
# with a space (easy), since we're just going to index all the text as
! # one big field. Returns a scalar REF.
sub _indexable_object_text {
! my ( $self, $p ) = @_;
! $p ||= {};   # allow callers to omit the parameter hashref
my $R = OpenInteract::Request->instance;
!
! my $field_list = $p->{fulltext_field} || $self->CONFIG->{fulltext_field};   # per-call override wins over config
unless ( ref $field_list eq 'ARRAY' ) {
$R->scrib( 0, "Cannot index object text -- no fields presented in config file." );
return undef;
}
! my $indexable = '';   # start defined so join() never sees undef and the returned ref never points at undef
! foreach my $field ( @{ $field_list } ) {
! if ( ! ref $self->{ $field } ) {   # plain scalar value: use it directly
! $indexable = join( ' ', $indexable, $self->{ $field } );
! }
! elsif ( ref $self->{ $field } eq 'SCALAR' ) {   # scalar ref: dereference the field's value
! $indexable = join( ' ', $indexable, ${ $self->{ $field } } );   # was $$self->{...}, which dereferenced $self itself
! }
!
! else {   # any other reference is treated as a filehandle
! my $fh = $self->{ $field };
! $indexable = join( ' ', $indexable, <$fh> );   # list context: reads every remaining line from the handle
! }
! }
! return \$indexable;   # scalar REF, as _tokenize() expects
}
***************
*** 217,228 ****
# Break up the text into tokens -- stemmed using Lingua::Stem and
# counted for occurrences. Remove the words that are too long, too
! # short and those that are found in our STOPWORDS listing.
sub _tokenize {
! my ( $self, $text ) = @_;
! $text =~ tr/A-Z/a-z/; # lowercase
my %words = ();
! map { $words{ $_ }++ } map { Lingua::Stem::stem( $_ )->[0] } ( $text =~ /\w+/g );
! map { delete $words{ $_ } }
grep { length $_ < MIN_WORD_LENGTH || length $_ > MAX_WORD_LENGTH }
keys %words;
--- 233,245 ----
# Break up the text into tokens -- stemmed using Lingua::Stem and
# counted for occurrences. Remove the words that are too long, too
! # short and those that are found in our STOPWORDS listing. Takes a
! # scalar REF as an argument.
sub _tokenize {
! my ( $self, $text_ref ) = @_;
! $$text_ref =~ tr/A-Z/a-z/; # lowercase
my %words = ();
! map { $words{ $_ }++ } map { Lingua::Stem::stem( $_ )->[0] } ( $$text_ref =~ /\w+/g );
! map { delete $words{ $_ } }
grep { length $_ < MIN_WORD_LENGTH || length $_ > MAX_WORD_LENGTH }
keys %words;
***************
*** 522,529 ****
(From: http://ollie.dcccd.edu/library/Module2/Books/concepts.htm)
! We use the L<Lingua::Stem> module for this, which implements the
! I<Porter algorithm> for stemming, as do most implementations,
! apparently. (This is something that I<OpenInteract::FullText> treats as a
! black box itself :)
Parameters:
--- 539,546 ----
(From: http://ollie.dcccd.edu/library/Module2/Books/concepts.htm)
! We use the L<Lingua::Stem|Lingua::Stem> module for this, which
! implements the I<Porter algorithm> for stemming, as do most
! implementations, apparently. (This is something that this class treats
! as a black box itself :)
Parameters:
|