Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo

Close

#1 (beta) Lucene search algorithm

open
None
7
2002-04-20
2002-04-20
No

I have finally got the time to finish coding beta
version of the
Lucene searching algorithm for FullTextSearch module.

The patch file and test scripts should be found inside
text_search.rar archive file. The archive should contain
these files:
FullTextSearch.patch -- patch file (using
WinPatchMaker-1.0)
index_search.conf -- configuration file for my index
searcher test script.
index_search_init.pl -- run this first to initialize
index tables etc.
index_search_test.pl -- the main test file (contains a
few test cases and allows you to easily add your own).

Since this is only a 'beta' release of the algorithm
implementation,
apply the patch against a copy of the FullTextSearch
module.
In this release, scoring has been implemented for the
phrase
backend only. Also, for now I assume numerical
document ids (rather
than strings as whould be the case with the String
backend?) only.
Adding scoring to other backends shouldn't be a hard
task since
all major scoring routines are located in the main
FullTextSearch.pm
module. There's actually only a single subroutine that
has to be
invoked from other backend modules in order to enable
scoring for them.

I hope you'll find inline documentation useful.

At this stage it is crucial that we get
comments/suggestions/brilliant
ideas flowing in. ;-). Please, post your thoughts to
the devel mailing list or in the forums.

Cheers,
Vlad.

Discussion

  • beta patch + test scripts.

     
    Attachments
    • priority: 5 --> 7
    • assigned_to: nobody --> tjmather
    • status: open --> pending
     
    • status: pending --> open
     
  • Logged In: YES
    user_id=498663

    -------- PATCH: ------------------------------------------

    Only in D:\vlad\programming\perl\lib_dev\DBIx: #DEV_NOTES#
    Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES
    Only in D:\vlad\programming\perl\lib_dev\DBIx: DEV_NOTES~
    diff -ur --exclude=CVS
    D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm
    D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm
    --- D:\vlad\lib\perl\DBIx/FullTextSearch/Phrase.pm Sat Feb 23 01:32:38 2002
    +++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch/Phrase.pm Thu Apr 18 14:32:18 2002
    @@ -113,6 +113,8 @@
    };
    my $out = {};

    +#
    $DB::single = 1;
    +
    for my $phrase (@_){

    my @words = split\(' ', $phrase\);
    

    @@ -151,14 +153,22 @@

                \}
    
                delete $cur\_pos\{$doc\} unless $isPhrase;
    
            \}
    

    -
    }
    +
    }

    \}
    
    for my $doc \(keys %cur\_pos\)\{
    

    -
    my @positions = keys %{$cur_pos{$doc}};
    -
    $out->{$doc} += scalar (@positions);
    +
    my @positions = keys %{$cur_pos{$doc}};
    +
    $out->{$doc} += scalar (@positions);
    +

    +
    $fts->_update_term_score(
    +
    term => $phrase,
    +
    count => scalar (@positions),
    +
    docid => $doc,
    +
    );
    +

    \}
    \}
    

    +

    return $out;
    

    }

    Only in
    D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch: Phrase.pm~
    diff -ur --exclude=CVS
    D:\vlad\lib\perl\DBIx/FullTextSearch.pm
    D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm
    --- D:\vlad\lib\perl\DBIx/FullTextSearch.pm Sat Feb 23 04:42:24 2002
    +++ D:\vlad\programming\perl\lib_dev\DBIx/FullTextSearch.pm Sat Apr 20 16:21:00 2002
    @@ -366,6 +366,7 @@
    my @words = eval $filter;
    @words = grep !$stoplist->is_stop_word($_), @words if
    defined($stoplist);
    @words = @{$stemmer->stem(@words)} if defined($stemmer);
    +
    for my $word ( @words ) {

    push @\{$words\{$word\}\}, ++$i;
    \}
    

    @@ -423,14 +424,26 @@

    $phrase =~ s/\\\*/%/g;
    
    push @phrases, $phrase;
    \}
    

    +
    +
    # calculate frequencies of each term (ak'a phrase) in
    +
    # this query.
    +
    my $n_terms = @phrases;
    +
    $self->{'query_term_f'}{$_} += 1/$n_terms for (@phrases);
    +

    $self->\{'db\_backend'\}->contains\_hashref\(@phrases\);
    

    }

    sub contains {
    my $self = shift;
    -
    my $res = $self->contains_hashref(@_);
    +
    $self->{'query_term_f'} = undef;
    +
    my $res = $self->contains_hashref(@_);
    if (not $self->{'count_bits'}) { return keys %$res; }
    -
    return sort { $res->{$b} <=> $res->{$a} } keys %$res;
    +
    +
    # term count is possible if count_bits exists, therefore,
    +
    # calculate document score.
    +
    return $self->calculate_doc_scores();
    +
    +
    #return sort { $res->{$b} <=> $res->{$a} } keys %$res;
    }

    sub econtains_hashref {
    @@ -482,6 +495,7 @@

    sub econtains {
    my $self = shift;
    +
    $self->{'query_term_f'} = undef;
    my $res = $self->econtains_hashref(@_);
    if (not $self->{'count_bits'}) { return keys %$res; }
    return sort { $res->{$b} <=> $res->{$a} } keys %$res;
    @@ -548,6 +562,158 @@
    $self->{'db_backend'}->common_word($k);
    }

    +
    +#
    +# calculate scores for each found document
    +# and return reference to a hash of this form:
    +# ...
    +# document id => document score
    +# ...
    +#
    +# Scoring is based on Lucene's scoring algorithm.
    +#
    +# Formula:
    +# score_d = sum_t( tf_q * idf_t / norm_q * tf_d * idf_t /
    norm_d_t)
    +# where:
    +# score_d : score for document d
    +# sum_t : sum for all terms t
    +# tf_q : the square root of the frequency of t in
    the query
    +# tf_d : the square root of the frequency of t in d
    +# numDocs : number of documents in index
    +# docFreq_t : number of documents containing t
    +# idf_t : log(numDocs/docFreq_t+1) + 1.0
    +# norm_q : sqrt(sum_t((tf_q*idf_t)^2))
    +# norm_d_t : square root of number of tokens in d in the
    same field as t
    +#
    +sub calculate_doc_scores {
    + my $self = shift;
    +
    + # TODO: should probably use DBI's prepare_cached instead
    + my $doc_word_count_sth = $self->_prepare_cached(
    +
    name => 'doc_word_count_sth',
    +
    statement => "select count(*) from $self->{data_table} where
    doc_id = ?"
    +
    );
    +
    + # numDocs
    + my $doc_count = $self->document_count();
    +
    + # will contain document scores
    + my %res;
    +
    + foreach my $doc (keys %{$self->{score}{docs}}) {
    + $doc_word_count_sth->execute($doc)
    + or die $doc_word_count_sth->errstr;
    +
    + my $res = $doc_word_count_sth->fetch();
    + my ($doc_word_count) = $res->[0];
    + my $doc_hits = $self->{score}{docs}{$doc};
    + my ($doc_score, $tf_q, $idf_t, $doc_term_freq) = (0)x4;
    +
    + # calculate norm_q for all terms.
    + # using this formula:
    + # sqrt(sum_t((tf_q*idf_t)^2))
    + #
    + # where idf_t = log(doc_count/doc_term_freq + 1) + 1
    + my (%term_vars, $term, $norm_q);
    +
    + foreach my $term_id (keys %$doc_hits) {
    + $term = $self->{score}{term_list}[$term_id];
    + $term_vars{$term_id}{tf_q} =
    sqrt($self->{query_term_f}{$term});
    + # docFreq_t -- number of documents containing given term
    + $term_vars{$term_id}{doc_term_freq} =
    $self->{score}{term_doc_count}{$term_id};
    + $term_vars{$term_id}{idf_t} = log($doc_count /
    $term_vars{$term_id}{doc_term_freq} + 1) + 1;
    + $norm_q += ($term_vars{$term_id}{tf_q} *
    $term_vars{$term_id}{idf_t})**2;
    + }
    +
    + $norm_q = sqrt($norm_q);
    +
    + foreach my $term_id (keys %$doc_hits) {
    + my $term = $self->{term_list}->[$term_id];
    + my $tf_d = sqrt($doc_hits->{$term_id} / $doc_word_count);
    + # sum_t( tf_q * idf_t / norm_q * tf_d * idf_t)
    + # or
    + # sum_t( tf_q * tf_d * idf_t^2 / norm_q)
    + $doc_score += $term_vars{$term_id}{tf_q} *
    ($term_vars{$term_id}{idf_t}**2) * $tf_d / $norm_q ;
    + }
    +
    + $res{$doc} = $doc_score;
    + }
    +
    + return \%res;
    +}
    +
    +sub _prepare_cached {
    + my ($self, %args) = @_;
    +
    + my $dbh = $self->{dbh};
    +
    + return (defined $self->{$args{name}}
    +
    ? $self->{$args{name}}
    +
    : $self->{$args{name}} = $dbh->prepare($args{statement})
    or die $dbh->errstr);
    +}
    +
    +#
    +# this method is called whenever a new term/phrase is found
    +# in a document. All data related to this term (number of
    +# occurances, id of a document it was found in etc) is then
    +# saved in 'score' hash of this object. This score hash
    +# will in turn be used to calculate final document scores
    +# when the search is complete.
    +#
    +# %args = (
    +# term => phrase or term that was located in a document
    +# count => number of times this phrase/term appears in
    the document
    +# docid => id of the document where the phrase/term was
    found.
    +# )
    +#
    +# The score hash has the following structure:
    +# example score hash (dump of a sample):
    +#
    +# DB<1> x $self->{score}
    +# 0 HASH(0x34012a8)
    +# 'docs' => HASH(0x3401314)
    +# 1 => HASH(0x340132c)
    +# 0 => 2
    +# 1 => 1
    +# 2 => 1
    +# 2 => HASH(0x34007a4)
    +# 0 => 4
    +# 1 => 2
    +# 2 => 2
    +# 3 => HASH(0x34007c8)
    +# 0 => 2
    +# 1 => 3
    +# 2 => 1
    +# 4 => HASH(0x3400810)
    +# 0 => 6
    +# 1 => 1
    +# 2 => 1
    +# 'term_list' => ['foo', 'four', 'three'],
    +# 'terms' => HASH(0x34012e4)
    +# 'foo' => 0
    +# 'four' => 1
    +# 'three' => 2
    +#
    +sub _update_term_score {
    + my ($self, %args) = @_;
    +
    + # scoring is enabled?
    + # return unless (exists $self->{scoring} &&
    $self->{scoring} == 1);
    +
    + # save term (in term_id table)
    + unless (exists $self->{score}{terms}{$args{term}}) {
    + # add to current list of all terms
    + push @{$self->{score}{term_list}}, $args{term};
    + #
    + $self->{score}{terms}{$args{term}} = (exists
    $self->{score}{term_list})
    + ?
    scalar(@{$self->{score}{term_list}})
    +
    : 0;
    + }
    +
    + my $term_id = $self->{score}{terms}{$args{term}};
    + $self->{score}{docs}{$args{docid}}{$term_id} += $args{count};
    + $self->{score}{term_doc_count}{$term_id}++;
    +}

    1;

    Only in D:\vlad\programming\perl\lib_dev\DBIx:
    FullTextSearch.pm~
    Only in D:\vlad\programming\perl\lib_dev\DBIx:
    FullTextSearch.pm~~

     
  • Logged In: YES
    user_id=46353

    Tests failed when this patch was applied. Awaiting an updated
    patch.