#1 (beta) Lucene search algorithm
I have finally found the time to finish coding a beta
version of the
Lucene searching algorithm for the FullTextSearch module.
The patch file and test scripts can be found inside the
text_search.rar archive file. The archive should contain
these files:
FullTextSearch.patch -- patch file (using
WinPatchMaker-1.0)
index_search.conf -- configuration file for my index
searcher test script.
index_search_init.pl -- run this first to initialize
index tables etc.
index_search_test.pl -- the main test file (contains a
few test cases and allows you to easily add your own).
Since this is only a 'beta' release of the algorithm
implementation,
apply the patch against a copy of the FullTextSearch
module.
In this release, scoring has been implemented for the
phrase
backend only. Also, for now I assume numerical
document ids only (rather
than strings, as would be the case with the String
backend?).
Adding scoring to other backends shouldn't be a hard
task since
all major scoring routines are located in the main
FullTextSearch.pm
module. There's actually only a single subroutine that
has to be
invoked from other backend modules in order to enable
scoring for them.
I hope you'll find the inline documentation useful.
At this stage it is crucial that we get
comments/suggestions/brilliant
ideas flowing in ;-). Please post your thoughts to
the devel mailing list or in the forums.
Cheers,
Vlad.
beta patch + test scripts.
Logged In: YES
user_id=498663
-------- PATCH: ------------------------------------------
Only in D:\vlad\programming\perl\lib_dev\DBIx: #DEV_NOTES#
Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES
Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES~
diff -ur --exclude=CVS
D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm
--- D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm Sat Feb 23 01:32:38 2002
+++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm Thu Apr 18 14:32:18 2002
@@ -113,6 +113,8 @@
};
my $out = {};
+#
$DB::single = 1;
+
for my $phrase (@_){
@@ -151,14 +153,22 @@
-
}
+
}
-
my @positions = keys %{$cur_pos{$doc}};
-
$out->{$doc} += scalar (@positions);
+
my @positions = keys %{$cur_pos{$doc}};
+
$out->{$doc} += scalar (@positions);
+
+
$fts->_update_term_score(
+
term => $phrase,
+
count => scalar (@positions),
+
docid => $doc,
+
);
+
+
}
Only in
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch: Phrase.pm~
diff -ur --exclude=CVS
D:\vlad\lib\perl\DBIx/FullTextSearch.pm
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm
--- D:\vlad\lib\perl\DBIx/FullTextSearch.pm Sat Feb 23 04:42:24 2002
+++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm Sat Apr 20 16:21:00 2002
@@ -366,6 +366,7 @@
my @words = eval $filter;
@words = grep !$stoplist->is_stop_word($_), @words if
defined($stoplist);
@words = @{$stemmer->stem(@words)} if defined($stemmer);
+
for my $word ( @words ) {
@@ -423,14 +424,26 @@
+
+
# calculate frequencies of each term (a.k.a. phrase) in
+
# this query.
+
my $n_terms = @phrases;
+
$self->{'query_term_f'}{$_} += 1/$n_terms for (@phrases);
+
}
sub contains {
my $self = shift;
-
my $res = $self->contains_hashref(@_);
+
$self->{'query_term_f'} = undef;
+
my $res = $self->contains_hashref(@_);
if (not $self->{'count_bits'}) { return keys %$res; }
-
return sort { $res->{$b} <=> $res->{$a} } keys %$res;
+
+
# term count is possible if count_bits exists, therefore,
+
# calculate document score.
+
return $self->calculate_doc_scores();
+
+
#return sort { $res->{$b} <=> $res->{$a} } keys %$res;
}
sub econtains_hashref {
@@ -482,6 +495,7 @@
sub econtains {
my $self = shift;
+
$self->{'query_term_f'} = undef;
my $res = $self->econtains_hashref(@_);
if (not $self->{'count_bits'}) { return keys %$res; }
return sort { $res->{$b} <=> $res->{$a} } keys %$res;
@@ -548,6 +562,158 @@
$self->{'db_backend'}->common_word($k);
}
or die $dbh->errstr);
+}
+
+#
+# this method is called whenever a new term/phrase is found
+# in a document. All data related to this term (number of
+# occurrences, id of a document it was found in etc) is then
+# saved in 'score' hash of this object. This score hash
+# will in turn be used to calculate final document scores
+# when the search is complete.
+#
+# %args = (
+# term => phrase or term that was located in a document
+# count => number of times this phrase/term appears in
the document
+# docid => id of the document where the phrase/term was
found.
+# )
+#
+# The score hash has the following structure:
+# example score hash (dump of a sample):
+#
+# DB<1> x $self->{score}
+# 0 HASH(0x34012a8)
+# 'docs' => HASH(0x3401314)
+# 1 => HASH(0x340132c)
+# 0 => 2
+# 1 => 1
+# 2 => 1
+# 2 => HASH(0x34007a4)
+# 0 => 4
+# 1 => 2
+# 2 => 2
+# 3 => HASH(0x34007c8)
+# 0 => 2
+# 1 => 3
+# 2 => 1
+# 4 => HASH(0x3400810)
+# 0 => 6
+# 1 => 1
+# 2 => 1
+# 'term_list' => ['foo', 'four', 'three'],
+# 'terms' => HASH(0x34012e4)
+# 'foo' => 0
+# 'four' => 1
+# 'three' => 2
+#
+sub _update_term_score {
+ my ($self, %args) = @_;
+
+ # scoring is enabled?
+ # return unless (exists $self->{scoring} &&
$self->{scoring} == 1);
+
+ # save term (in term_id table)
+ unless (exists $self->{score}{terms}{$args{term}}) {
+ # add to current list of all terms
+ push @{$self->{score}{term_list}}, $args{term};
+ #
+ $self->{score}{terms}{$args{term}} = (exists
$self->{score}{term_list})
+ ?
scalar(@{$self->{score}{term_list}})
+
: 0;
+ }
+
+ my $term_id = $self->{score}{terms}{$args{term}};
+ $self->{score}{docs}{$args{docid}}{$term_id} += $args{count};
+ $self->{score}{term_doc_count}{$term_id}++;
+}
1;
Only in D:\vlad\programming\perl\lib_dev\DBIx:
FullTextSearch.pm~
Only in D:\vlad\programming\perl\lib_dev\DBIx:
FullTextSearch.pm~~
Logged In: YES
user_id=46353
Tests failed when this patch was applied. Awaiting an updated
patch.