[cvs] bogofilter bogoutil.c,1.53,1.53.2.1 maint.c,1.10,1.10.2.1
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <m-...@us...> - 2003-01-14 04:08:38
|
Update of /cvsroot/bogofilter/bogofilter In directory sc8-pr-cvs1:/tmp/cvs-serv7052 Modified Files: Tag: bf-092-database-1 bogoutil.c maint.c Log Message: Use the db_foreach iterator. Do not lock. Forget about datastore_db internals. Index: bogoutil.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/bogoutil.c,v retrieving revision 1.53 retrieving revision 1.53.2.1 diff -u -d -r1.53 -r1.53.2.1 --- bogoutil.c 13 Jan 2003 21:38:58 -0000 1.53 +++ bogoutil.c 14 Jan 2003 04:08:35 -0000 1.53.2.1 @@ -23,10 +23,11 @@ #include <config.h> #include "common.h" +#include "xmalloc.h" +#include "xstrdup.h" #include "bogofilter.h" #include "datastore.h" -#include "datastore_db.h" #include "error.h" #include "maint.h" #include "robinson.h" /* for ROBS and ROBX */ @@ -40,79 +41,130 @@ run_t run_type = RUN_NORMAL; const char *progname = PROGNAME; -char *directory; -static int dump_file(char *db_file) +static int db_dump_hook(char *key, long keylen, char *data, long + datalen, void *userdata /*@unused@*/) { - dbh_t *dbh; + dbv_t val = {0, 0}; + (void)userdata; - DBC dbc; - DBC *dbcp; - DBT key, data; + if (datalen != 4 && datalen != 8) { + print_error(__FILE__, __LINE__, "Unknown data size - %ld.\n", datalen); + return 0; + } - int ret; - int rv = 0; + memcpy(&val, data, datalen); - dbcp = &dbc; + if (!keep_count(val.count) || !keep_date(val.date) || !keep_size(keylen)) + return 0; + if (replace_nonascii_characters) + do_replace_nonascii_characters((byte *)key, keylen); + fwrite(key, 1, keylen, stdout); + putchar(' '); + printf("%lu", val.count); + if (val.date) { + printf(" %lu", val.date); + } + putchar('\n'); + return !!ferror(stdout); +} - memset(&key, 0, sizeof(DBT)); - memset(&data, 0, sizeof(DBT)); +static int count_hook(char *key, long keylen, char *data, long + datalen, void *userdata) +{ + long *counter = userdata; - if ((dbh = db_open(db_file, db_file, DB_READ, directory)) == NULL) { - rv = 2; + (void)key; + 
(void)keylen; + (void)data; + (void)datalen; + + (*counter)++; + return 0; +} + +struct robhook_data { + double *sum; + long *count; + void *dbh_good; + double scalefactor; +}; + +static int robx_hook(char *key, long keylen, char *data, long + datalen, void *userdata) +{ + struct robhook_data *rd = userdata; + + long goodness; + unsigned long spamness; + double prob; + static int x_size = 40; + static char *x; + + /* ignore system meta-data */ + if (*key == '.') + return 0; + + /* ignore short read */ + if (datalen < 4) + return 0; + + if (keylen + 1 > x_size) { + free(x); + x = NULL; + x_size = keylen + 1; } - else { - db_lock_reader(dbh); + if (!x) x = xmalloc(x_size); - if ((ret = dbh->dbp->cursor(dbh->dbp, NULL, &dbcp, 0) != 0)) { - dbh->dbp->err(dbh->dbp, ret, PROGNAME " (cursor): %s", db_file); - rv = 2; - } - else { - for (;;) { - ret = dbcp->c_get(dbcp, &key, &data, DB_NEXT); - if (ret == 0) { - long cv[2]; - dbv_t val; - memcpy( &cv, data.data, data.size ); - if (!dbh->is_swapped){ /* convert from struct to array */ - val.count = cv[0]; - val.date = cv[1]; - } else { - val.count = swap_32bit(cv[0]); - val.date = swap_32bit(cv[1]); - } - if (!keep_count(val.count) || !keep_date(val.date) || !keep_size(key.size)) - continue; - if (replace_nonascii_characters) - do_replace_nonascii_characters((byte *)key.data, key.size); - if (data.size != 4 && data.size != 8) - print_error(__FILE__, __LINE__, "Unknown data size - %d.\n", data.size); - if (data.size == 4 || val.date == 0) - printf("%.*s %lu\n", (int)key.size, (char *) key.data, val.count); - else - printf("%.*s %lu %lu\n", (int)key.size, (char *) key.data, val.count, val.date); - } - else if (ret == DB_NOTFOUND) { - break; - } - else { - dbh->dbp->err(dbh->dbp, ret, PROGNAME " (c_get)"); - rv = 2; - break; - } - } + memcpy(x, key, keylen); + x[keylen] = '\0'; + + memcpy(&spamness, data, 4); + goodness = db_getvalue(rd->dbh_good, x); + + prob = spamness / (goodness * rd->scalefactor + spamness); + (*rd->sum) 
+= prob; + + /* tokens in good list were already counted */ + /* now add in tokens only in spam list */ + if (goodness == 0) + (*rd->count) ++; + + /* print if token in both word lists */ + if (verbose > 1 && (goodness && spamness)) + printf("cnt: %6ld, sum: %12.6f, ratio: %f, sp: %4lu, gd: %4ld," + " p: %f, t: %s\n", *rd->count, *rd->sum, + *rd->sum / *rd->count, spamness, goodness, prob, x); + + return 0; +} + +static int db_oper(const char *path, dbmode_t open_mode, db_foreach_t funct, + void *userdata) { + void *dbh; + + if ((dbh = db_open(path, path, open_mode, directory)) == NULL) { + exit(2); + } else { + int r = db_foreach(dbh, funct, userdata); + if (r) { + db_close(dbh); + return r; } - db_lock_release(dbh); } - return rv; + return 0; +} + +static int dump_file(char *db_file) +{ + return db_oper(db_file, DB_READ, db_dump_hook, NULL); } #define BUFSIZE 512 static int load_file(char *db_file) { - dbh_t *dbh; + void *dbh; byte buf[BUFSIZE]; unsigned char *p; int rv = 0; @@ -126,10 +178,7 @@ memset(buf, '\0', BUFSIZE); - db_lock_writer(dbh); - for (;;) { - if (fgets((char *)buf, BUFSIZE, stdin) == NULL) { if (ferror(stdin)) { perror(PROGNAME); @@ -192,22 +241,11 @@ count += db_getvalue(dbh, (char *)buf); db_setvalue(dbh, (char *)buf, count); } - db_lock_release(dbh); db_close(dbh); return rv; } -#if 0 -static dbh_t *db_open_and_lock_file( const char *db_file, const char *name, dbmode_t mode) -{ - dbh_t *dbh = db_open(db_file, name, mode); - if (dbh != NULL) - db_lock_reader(dbh); - return dbh; -} -#endif - static int get_token(char *buf, int bufsize, FILE *fp) { char *p; @@ -239,7 +277,7 @@ static int words_from_list(const char *db_file, int argc, char **argv) { - dbh_t *dbh; + void *dbh; int rv = 0; dbh = db_open(db_file, db_file, DB_READ, directory); @@ -268,8 +306,8 @@ static int words_from_path(const char *dir, int argc, char **argv, bool show_probability) { - dbh_t *dbh_good; - dbh_t *dbh_spam; + void *dbh_good; + void *dbh_spam; char filepath[PATH_LEN]; 
char buf[BUFSIZE]; char *token = buf; @@ -279,6 +317,7 @@ const char *head_format = !show_probability ? "%-20s %6s %6s\n" : "%-20s %6s %6s %6s %6s\n"; const char *data_format = !show_probability ? "%-20s %6ld %6ld\n" : "%-20s %6ld %6ld %f %f\n"; + /* XXX FIXME: deadlock possible */ if (build_path(filepath, sizeof(filepath), dir, GOODFILE) < 0) return 2; @@ -307,15 +346,14 @@ { if (get_token(buf, BUFSIZE, stdin) != 0) break; - } - else - { + } else { token = *argv++; if (--argc == 0) argc = -1; } spam_count = db_getvalue(dbh_spam, token); - good_count = db_getvalue(dbh_good, token); + good_count = db_getvalue(dbh_good, token); + if (show_probability) { double spamness = (double) spam_count / (double) spam_msg_count; @@ -352,9 +390,7 @@ words_from_path(path, argc, argv, prob); else words_from_list(path, argc, argv); - } - else - { + } else { if (errno==ENOENT) { fprintf(stderr, "No such directory.\n"); return 0; @@ -367,98 +403,29 @@ return 0; } -static double compute_robx(dbh_t *dbh_spam, dbh_t *dbh_good) +static double compute_robx(void *dbh_spam, void *dbh_good) { - DBT key, data; - int ret = 0; - int word_cnt = 0; + long word_cnt = 0; double sum = 0.0; double robx; - DBC dbc_spam; - DBC *dbcp_spam = &dbc_spam; - DBC dbc_good; - DBC *dbcp_good = &dbc_good; - - double scalefactor; long int msg_good, msg_spam; + struct robhook_data rh; msg_good = db_getvalue( dbh_good, ".MSG_COUNT" ); msg_spam = db_getvalue( dbh_spam, ".MSG_COUNT" ); - scalefactor = (double) msg_spam / (double) msg_good; - - memset(&key, 0, sizeof(DBT)); - memset(&data, 0, sizeof(DBT)); - - if ((ret = dbh_good->dbp->cursor(dbh_good->dbp, NULL, &dbcp_good, 0) != 0)) { - dbh_good->dbp->err(dbh_good->dbp, ret, PROGNAME " (cursor): %s", "dbh_good->file"); - return 2; - } - - for (;;) { - ret = dbcp_good->c_get(dbcp_good, &key, &data, DB_NEXT); - if (ret == 0) { - word_cnt += 1; /* count words in good list */ - } - else if (ret == DB_NOTFOUND) { - break; - } - else { - dbh_good->dbp->err(dbh_spam->dbp, 
ret, PROGNAME " (c_get)"); - ret = 2; - break; - } - } - dbcp_good->c_close(dbcp_good); - - if ((ret = dbh_spam->dbp->cursor(dbh_spam->dbp, NULL, &dbcp_spam, 0) != 0)) { - dbh_spam->dbp->err(dbh_spam->dbp, ret, PROGNAME " (cursor): %s", "dbh_spam->file"); - return 2; - } - - for (;;) { - ret = dbcp_spam->c_get(dbcp_spam, &key, &data, DB_NEXT); - if (ret == 0) { - long goodness; - unsigned long spamness; - double prob; - char *token = key.data; - token[key.size] = '\0'; - - /* ignore system meta-data */ - if ( *token == '.') - continue; - - spamness = *(unsigned long *) data.data; - goodness = db_getvalue(dbh_good, token); - - prob = spamness / (goodness*scalefactor + spamness); - sum += prob; - - /* tokens in good list were already counted */ - /* now add in tokens only in spam list */ - if (goodness == 0) - word_cnt += 1; + rh.scalefactor = (double)msg_spam/msg_good; + rh.dbh_good = dbh_good; + rh.sum = &sum; + rh.count = &word_cnt; - /* print if token in both word lists */ - if (verbose > 1 && (goodness && spamness)) - printf("cnt: %6d, sum: %12.6f, ratio: %f, sp: %4lu, gd: %4ld, p: %f, t: %s\n", word_cnt, sum, sum/word_cnt, spamness, goodness, prob, token); - } - else if (ret == DB_NOTFOUND) { - break; - } - else { - dbh_spam->dbp->err(dbh_spam->dbp, ret, PROGNAME " (c_get)"); - ret = 2; - break; - } - } - dbcp_spam->c_close(dbcp_spam); + db_foreach(dbh_good, count_hook, &word_cnt); + db_foreach(dbh_spam, robx_hook, &rh); robx = sum/word_cnt; if (verbose) printf( ".MSG_COUNT: %ld, %ld, scale: %f, sum: %f, cnt: %6d, .ROBX: %f\n", - msg_spam, msg_good, scalefactor, sum, (int)word_cnt, robx); + msg_spam, msg_good, rh.scalefactor, sum, (int)word_cnt, robx); return robx; } @@ -466,9 +433,9 @@ static int compute_robinson_x(char *path) { int e; + wordlist_t wl[2]; - dbh_t *dbh_good; - dbh_t *dbh_spam; + void *dbh_spam; char db_spam_file[PATH_LEN]; char db_good_file[PATH_LEN]; @@ -480,21 +447,27 @@ e = build_path(db_good_file, sizeof(db_good_file), path, "goodlist.db"); if 
(e < 0) goto overflow; - dbh_good = db_open(db_good_file, "good", DB_READ, directory); - dbh_spam = db_open(db_spam_file, "spam", DB_WRITE, directory); + memset(wl, 0, sizeof(wl)); - db_lock_reader(dbh_good); - db_lock_writer(dbh_spam); + wl[0].next = &wl[1]; + wl[0].filepath = db_good_file; + wl[0].filename = xstrdup("good"); - robx = compute_robx( dbh_spam, dbh_good ); + wl[1].next = NULL; + wl[1].filepath = db_spam_file; + wl[1].filename = xstrdup("spam"); - db_setvalue(dbh_spam, ".ROBX", (int) (robx * 1000000)); + word_lists = wl; + open_wordlists(DB_READ); - db_lock_release(dbh_spam); - db_close(dbh_spam); + robx = compute_robx(wl[1].dbh, wl[0].dbh); + close_wordlists(); + free(wl[0].filename); + free(wl[1].filename); - db_lock_release(dbh_good); - db_close(dbh_good); + dbh_spam = db_open(db_spam_file, "spam", DB_WRITE, directory); + db_setvalue(dbh_spam, ".ROBX", (int) (robx * 1000000)); + db_close(dbh_spam); return 0; overflow: Index: maint.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/maint.c,v retrieving revision 1.10 retrieving revision 1.10.2.1 diff -u -d -r1.10 -r1.10.2.1 --- maint.c 13 Jan 2003 15:11:36 -0000 1.10 +++ maint.c 14 Jan 2003 04:08:35 -0000 1.10.2.1 @@ -21,9 +21,9 @@ #include <assert.h> #include "common.h" +#include "xmalloc.h" #include "datastore.h" -#include "datastore_db.h" #include "maint.h" YYYYMMDD today; /* date as YYYYMMDD */ @@ -117,82 +117,58 @@ set_list_active_status(true); - db_lock_writer_list(word_lists); - for (list = word_lists; list != NULL; list = list->next) { maintain_wordlist(list->dbh); list = list->next; } - - db_lock_release_list(word_lists); } int maintain_wordlist_file(const char *db_file) { int rc; - dbh_t *dbh; + void *dbh; dbh = db_open(db_file, db_file, DB_WRITE, directory); if (dbh == NULL) return 2; - db_lock_writer(dbh); - rc = maintain_wordlist(dbh); - db_lock_release(dbh); - return rc; } -int maintain_wordlist(void *vhandle) +static int 
maintain_hook(char *key, long keylen, char *data, long + datalen, void *userdata /*@unused@*/) { - dbh_t *dbh = vhandle; - - DBC dbc; - DBC *dbcp; - DBT db_key, db_data; - - int ret; - int rv = 0; - dbv_t *val; - - dbcp = &dbc; + static int x_size = 40; + static char *x; + dbv_t val; + memcpy(&val, data, datalen); - memset(&db_key, 0, sizeof(DBT)); - memset(&db_data, 0, sizeof(DBT)); + (void)datalen; + if (replace_nonascii_characters) + do_replace_nonascii_characters((byte *)key,keylen); - if ((ret = dbh->dbp->cursor(dbh->dbp, NULL, &dbcp, 0) != 0)) { - dbh->dbp->err(dbh->dbp, ret, "%s (cursor): %s", progname, dbh->filename); - return (2); - } + if (!keep_count(val.count) || !keep_date(val.date) || !keep_size(keylen)) { + if (keylen + 1 > x_size) { + free(x); + x = NULL; + x_size = keylen + 1; + } + if (!x) x = xmalloc(x_size); - for (;;) { - ret = dbcp->c_get(dbcp, &db_key, &db_data, DB_NEXT); - if (ret == 0) { - val = (dbv_t *)db_data.data; - if (replace_nonascii_characters) - do_replace_nonascii_characters((byte *)db_key.data,db_key.size); + memcpy(x, key, keylen); + x[keylen] = '\0'; - if (! keep_count(val->count) || ! keep_date(val->date) || ! keep_size(db_key.size)) { - char *token = (char *) db_key.data; - int rc1, rc2; - token[db_key.size] = '\0'; - rc1 = dbcp->c_del(dbcp, 0); - rc2 = dbh->dbp->del(dbh->dbp, NULL, &db_key, 0); + db_delete(userdata, x); - if (DEBUG_DATABASE(0)) fprintf(dbgout, "deleting %s --> %d, %d\n", (char *)db_key.data, rc1, rc2); - } - } - else if (ret == DB_NOTFOUND) { - break; - } - else { - dbh->dbp->err(dbh->dbp, ret, "%s (c_get)", progname); - rv = 2; - break; - } + if (DEBUG_DATABASE(0)) fprintf(dbgout, "deleting %s\n", x); } + return 0; +} - return rv; +int maintain_wordlist(void *vhandle) +{ + void *dbh = vhandle; + return db_foreach(dbh, maintain_hook, dbh); } |