[cvs] SF.net SVN: bogofilter:[6780] trunk/bogofilter/src/score.c
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <re...@us...> - 2009-02-01 13:45:34
|
Revision: 6780
          http://bogofilter.svn.sourceforge.net/bogofilter/?rev=6780&view=rev
Author:   relson
Date:     2009-02-01 13:45:32 +0000 (Sun, 01 Feb 2009)

Log Message:
-----------
Clarify code.

Modified Paths:
--------------
    trunk/bogofilter/src/score.c

Modified: trunk/bogofilter/src/score.c
===================================================================
--- trunk/bogofilter/src/score.c	2009-02-01 04:12:16 UTC (rev 6779)
+++ trunk/bogofilter/src/score.c	2009-02-01 13:45:32 UTC (rev 6780)
@@ -61,6 +61,7 @@
 /* Function Prototypes */
 
 static double get_spamicity(size_t robn, FLOAT P, FLOAT Q);
+static bool need_scoring_boundary(wordhash_t *wh);
 static double find_scoring_boundary(wordhash_t *wh);
 static void compute_spamicity(wordhash_t *wh, FLOAT *P, FLOAT *Q, size_t *robn, bool need_stats);
 
@@ -223,14 +224,7 @@
     if (DEBUG_ALGORITHM(2))
         fprintf(dbgout, "min_dev: %f, robs: %f, robx: %f\n", min_dev, robs, robx);
 
-    if (token_count_min + token_count_max + token_count_fix == 0)
-    {
-        score.min_dev = min_dev;
-    }
-    else
-    {
-        score.min_dev = find_scoring_boundary(wh);
-    }
+    score.min_dev = !need_scoring_boundary(wh) ? min_dev : find_scoring_boundary(wh);
 
     compute_spamicity(wh, &P, &Q, &robn, need_stats);
 
@@ -317,6 +311,55 @@
     }
 }
 
+/* need_scoring_boundary( )
+** determine if min_dev gives a count fitting the token count limits
+** return True if so; False if not
+*/
+bool need_scoring_boundary(wordhash_t *wh)
+{
+    size_t count = 0;
+
+    hashnode_t *node;
+
+    // Early out if no token count limits are set
+    if (token_count_min == 0 && token_count_max == 0 && token_count_fix == 0)
+        return false;
+
+    // Count scorable tokens
+    for (node = wordhash_first(wh); node != NULL; node = wordhash_next(wh))
+    {
+        double prob;
+        wordcnts_t *cnts;
+        wordprop_t *props;
+
+        if (!fBogotune) {
+            props = (wordprop_t *) node->buf;
+            cnts = &props->cnts;
+        } else {
+            cnts = (wordcnts_t *) node;
+        }
+
+        prob = calc_prob(cnts->good, cnts->bad,
+                         cnts->msgs_good, cnts->msgs_bad);
+
+        if (fabs(prob - EVEN_ODDS) >= min_dev)
+        {
+            count += 1;
+        }
+    }
+
+    // Compare count to limits
+    if (token_count_min != 0 && count < token_count_min)
+        return true;
+    if (token_count_max != 0 && count > token_count_max)
+        return true;
+    if (token_count_fix != 0 && token_count_fix != count)
+        return true;
+
+    // Count outside of limits -- change min_dev
+    return false;
+}
+
 /* find_scoring_boundary( )
 ** determine the token score that gives the desired token count
 ** for scoring the message.

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|