DBIx::FullTextSearch / Patches / #1 (beta) Lucene search algorithm

Logged In: YES
user_id=498663

-------- PATCH: ------------------------------------------

Only in D:\vlad\programming\perl\lib_dev\DBIx: #DEV_NOTES#
Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES
Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES~
diff -ur --exclude=CVS
D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm
--- D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm Sat Feb 23 01:32:38 2002
+++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm Thu Apr 18 14:32:18 2002
@@ -113,6 +113,8 @@
};
my $out = {};

+#
$DB::single = 1;
+
for my $phrase (@_){

my @words = split\(' ', $phrase\);

@@ -151,14 +153,22 @@

            \}

            delete $cur\_pos\{$doc\} unless $isPhrase;

        \}

-
}
+
}

\}


for my $doc \(keys %cur\_pos\)\{

-
my @positions = keys %{$cur_pos{$doc}};
-
$out->{$doc} += scalar (@positions);
+
my @positions = keys %{$cur_pos{$doc}};
+
$out->{$doc} += scalar (@positions);
+

+
$fts->_update_term_score(
+
term => $phrase,
+
count => scalar (@positions),
+
docid => $doc,
+
);
+

\}
\}

return $out;

}

Only in
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch: Phrase.pm~
diff -ur --exclude=CVS
D:\vlad\lib\perl\DBIx/FullTextSearch.pm
D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm
--- D:\vlad\lib\perl\DBIx/FullTextSearch.pm Sat Feb 23 04:42:24 2002
+++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm Sat Apr 20 16:21:00 2002
@@ -366,6 +366,7 @@
my @words = eval $filter;
@words = grep !$stoplist->is_stop_word($_), @words if
defined($stoplist);
@words = @{$stemmer->stem(@words)} if defined($stemmer);
+
for my $word ( @words ) {

push @\{$words\{$word\}\}, ++$i;
\}

@@ -423,14 +424,26 @@

$phrase =~ s/\\\*/%/g;

push @phrases, $phrase;
\}

+
+
# calculate frequencies of each term (ak'a phrase) in
+
# this query.
+
my $n_terms = @phrases;
+
$self->{'query_term_f'}{$_} += 1/$n_terms for (@phrases);
+

$self-&amp;gt;\{'db\_backend'\}-&amp;gt;contains\_hashref\(@phrases\);

}

sub contains {
my $self = shift;
-
my $res = $self->contains_hashref(@_);
+
$self->{'query_term_f'} = undef;
+
my $res = $self->contains_hashref(@_);
if (not $self->{'count_bits'}) { return keys %$res; }
-
return sort { $res->{$b} <=> $res->{$a} } keys %$res;
+
+
# term count is possible if count_bits exists, therefore,
+
# calculate document score.
+
return $self->calculate_doc_scores();
+
+
#return sort { $res->{$b} <=> $res->{$a} } keys %$res;
}

sub econtains_hashref {
@@ -482,6 +495,7 @@

sub econtains {
my $self = shift;
+
$self->{'query_term_f'} = undef;
my $res = $self->econtains_hashref(@_);
if (not $self->{'count_bits'}) { return keys %$res; }
return sort { $res->{$b} <=> $res->{$a} } keys %$res;
@@ -548,6 +562,158 @@
$self->{'db_backend'}->common_word($k);
}

+
+#
+# calculate scores for each found document
+# and return reference to a hash of this form:
+# ...
+# document id => document score
+# ...
+#
+# Scoring is based on Lucene's scoring algorithm.
+#
+# Formula:
+# score_d = sum_t( tf_q * idf_t / norm_q * tf_d * idf_t /
norm_d_t)
+# where:
+# score_d : score for document d
+# sum_t : sum for all terms t
+# tf_q : the square root of the frequency of t in
the query
+# tf_d : the square root of the frequency of t in d
+# numDocs : number of documents in index
+# docFreq_t : number of documents containing t
+# idf_t : log(numDocs/docFreq_t+1) + 1.0
+# norm_q : sqrt(sum_t((tf_q*idf_t)^2))
+# norm_d_t : square root of number of tokens in d in the
same field as t
+#
+sub calculate_doc_scores {
+ my $self = shift;
+
+ # TODO: should probably use DBI's prepare_cached instead
+ my $doc_word_count_sth = $self->_prepare_cached(
+
name => 'doc_word_count_sth',
+
statement => "select count(*) from $self->{data_table} where
doc_id = ?"
+
);
+
+ # numDocs
+ my $doc_count = $self->document_count();
+
+ # will contain document scores
+ my %res;
+
+ foreach my $doc (keys %{$self->{score}{docs}}) {
+ $doc_word_count_sth->execute($doc)
+ or die $doc_word_count_sth->errstr;
+
+ my $res = $doc_word_count_sth->fetch();
+ my ($doc_word_count) = $res->[0];
+ my $doc_hits = $self->{score}{docs}{$doc};
+ my ($doc_score, $tf_q, $idf_t, $doc_term_freq) = (0)x4;
+
+ # calculate norm_q for all terms.
+ # using this formula:
+ # sqrt(sum_t((tf_q*idf_t)^2))
+ #
+ # where idf_t = log(doc_count/doc_term_freq + 1) + 1
+ my (%term_vars, $term, $norm_q);
+
+ foreach my $term_id (keys %$doc_hits) {
+ $term = $self->{score}{term_list}[$term_id];
+ $term_vars{$term_id}{tf_q} =
sqrt($self->{query_term_f}{$term});
+ # docFreq_t -- number of documents containing given term
+ $term_vars{$term_id}{doc_term_freq} =
$self->{score}{term_doc_count}{$term_id};
+ $term_vars{$term_id}{idf_t} = log($doc_count /
$term_vars{$term_id}{doc_term_freq} + 1) + 1;
+ $norm_q += ($term_vars{$term_id}{tf_q} *
$term_vars{$term_id}{idf_t})**2;
+ }
+
+ $norm_q = sqrt($norm_q);
+
+ foreach my $term_id (keys %$doc_hits) {
+ my $term = $self->{term_list}->[$term_id];
+ my $tf_d = sqrt($doc_hits->{$term_id} / $doc_word_count);
+ # sum_t( tf_q * idf_t / norm_q * tf_d * idf_t)
+ # or
+ # sum_t( tf_q * tf_d * idf_t^2 / norm_q)
+ $doc_score += $term_vars{$term_id}{tf_q} *
($term_vars{$term_id}{idf_t}**2) * $tf_d / $norm_q ;
+ }
+
+ $res{$doc} = $doc_score;
+ }
+
+ return \%res;
+}
+
+sub _prepare_cached {
+ my ($self, %args) = @_;
+
+ my $dbh = $self->{dbh};
+
+ return (defined $self->{$args{name}}
+
? $self->{$args{name}}
+: $self->{$args{name}} = $dbh->prepare($args{statement})
or die $dbh->errstr);
+}
+
+#
+# this method is called whenever a new term/phrase is found
+# in a document. All data related to this term (number of
+# occurances, id of a document it was found in etc) is then
+# saved in 'score' hash of this object. This score hash
+# will in turn be used to calculate final document scores
+# when the search is complete.
+#
+# %args = (
+# term => phrase or term that was located in a document
+# count => number of times this phrase/term appears in
the document
+# docid => id of the document where the phrase/term was
found.
+# )
+#
+# The score hash has the following structure:
+# example score hash (dump of a sample):
+#
+# DB<1> x $self->{score}
+# 0 HASH(0x34012a8)
+# 'docs' => HASH(0x3401314)
+# 1 => HASH(0x340132c)
+# 0 => 2
+# 1 => 1
+# 2 => 1
+# 2 => HASH(0x34007a4)
+# 0 => 4
+# 1 => 2
+# 2 => 2
+# 3 => HASH(0x34007c8)
+# 0 => 2
+# 1 => 3
+# 2 => 1
+# 4 => HASH(0x3400810)
+# 0 => 6
+# 1 => 1
+# 2 => 1
+# 'term_list' => ['foo', 'four', 'three'],
+# 'terms' => HASH(0x34012e4)
+# 'foo' => 0
+# 'four' => 1
+# 'three' => 2
+#
+sub _update_term_score {
+ my ($self, %args) = @_;
+
+ # scoring is enabled?
+ # return unless (exists $self->{scoring} &&
$self->{scoring} == 1);
+
+ # save term (in term_id table)
+ unless (exists $self->{score}{terms}{$args{term}}) {
+ # add to current list of all terms
+ push @{$self->{score}{term_list}}, $args{term};
+ #
+ $self->{score}{terms}{$args{term}} = (exists
$self->{score}{term_list})
+ ?
scalar(@{$self->{score}{term_list}})
+
: 0;
+ }
+
+ my $term_id = $self->{score}{terms}{$args{term}};
+ $self->{score}{docs}{$args{docid}}{$term_id} += $args{count};
+ $self->{score}{term_doc_count}{$term_id}++;
+}

Only in D:\vlad\programming\perl\lib_dev\DBIx:
FullTextSearch.pm~
Only in D:\vlad\programming\perl\lib_dev\DBIx:
FullTextSearch.pm~~

(beta) Lucene search algorithm

Group

Searches

Help

#1 (beta) Lucene search algorithm

Discussion