[Bogofilter-cvs] bogofilter datastore_db.c,NONE,1.1 datastore_db.h,NONE,1.1 datastore.h,NONE,1.1 xma
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
Update of /cvsroot/bogofilter/bogofilter In directory usw-pr-cvs1:/tmp/cvs-serv25589 Modified Files: Makefile.am bogofilter.c bogofilter.h main.c Added Files: datastore_db.c datastore_db.h datastore.h xmalloc.c xmalloc.h bogoutil.c Log Message: Modified Files: Makefile.am -- add entries for datastore* + and other new files bogofilter.c bogofilter.h main.c -- fixup to use database abstraction Added Files: datastore_db.c datastore_db.h datastore.h -- database abstraction. Also implements locking xmalloc.c xmalloc.h -- utility bogoutil.c -- dump/restore utility. 1. Implements database abstraction as discussed. Also implements multiple readers/single writer file locking. 2. Adds utility to dump/restore databases. --- NEW FILE: datastore_db.c --- /***************************************************************************** NAME: datastore_db.c -- implements the datastore, using Berkeley DB AUTHOR: Gyepi Sam <gy...@pr...> ******************************************************************************/ #include <fcntl.h> #include <string.h> #include <stdlib.h> #include <sys/types.h> #include <unistd.h> #include <db.h> #include "xmalloc.h" #include "datastore.h" #include "datastore_db.h" #define ERRSTR(str) "bogofilter (db) " str #define DBT_init(dbt) do { memset(&dbt, 0, sizeof(DBT)); } while(0) static dbh_t *dbh_init(char *filename, char *name){ dbh_t *handle; handle = xmalloc(sizeof(dbh_t)); handle->filename = xmalloc(strlen(filename) + 1); strcpy(handle->filename,filename); handle->name = xmalloc(strlen(name)+1); strcpy(handle->name, name); handle->pid = getpid(); return handle; } static void dbh_free(dbh_t *handle){ if (handle == NULL) return; if (handle->filename != NULL) xfree(handle->filename); xfree(handle); } /* Initialize database. Returns: pointer database handle on success, NULL otherwise. 
*/ void *db_open(char *db_file, char *name){ int ret; dbh_t *handle; handle = dbh_init(db_file, name); if ((ret = db_create (&(handle->dbp), NULL, 0)) != 0){ fprintf (stderr, ERRSTR("db_create: %s\n"), db_strerror (ret)); } else if ((ret = handle->dbp->open (handle->dbp, db_file, NULL, DB_BTREE, DB_CREATE, 0664)) != 0){ handle->dbp->err (handle->dbp, ret, ERRSTR("open: %s"), db_file); } else { return (void *)handle; } dbh_free(handle); } /* Retrieve numeric value associated with word. Returns: value if the the word is found in database, 0 if the word is not found. Notes: Will call exit if an error occurs. */ long db_getvalue(void *vhandle, char *word){ DBT db_key; DBT db_data; long value; int ret; dbh_t *handle = vhandle; DBT_init(db_key); DBT_init(db_data); db_key.data = word; db_key.size = strlen(word); if ((ret = handle->dbp->get(handle->dbp, NULL, &db_key, &db_data, 0)) == 0){ value = *(long *)db_data.data; if (verbose > 2){ fprintf(stderr, "db_getvalue (%s): [%s] has value %ld\n",handle->name, word, value); } return(value); } else if (ret == DB_NOTFOUND){ return 0; } else { handle->dbp->err (handle->dbp, ret, ERRSTR("db_getvalue: %s"), word); exit(2); } } /* Store VALUE in database, using WORD as database key Notes: Calls exit if an error occurs. */ void db_setvalue(void *vhandle, char * word, long value){ int ret; DBT key; DBT data; dbh_t *handle = vhandle; DBT_init(key); DBT_init(data); key.data = word; key.size = strlen(word); data.data = &value; data.size = sizeof(long); if ((ret = handle->dbp->put(handle->dbp, NULL, &key, &data, 0)) == 0){ if (verbose > 2){ fprintf(stderr, "db_setvalue (%s): [%s] has value %ld\n", handle->name, word, value); } } else { handle->dbp->err (handle->dbp, ret, ERRSTR("db_setvalue: %s"), word); exit(2); } } /* Increment count associated with WORD, by VALUE. 
*/ void db_increment(void *vhandle, char *word, long value){ value += db_getvalue(vhandle, word); if (value < 0) value = 0; db_setvalue(vhandle, word, value); } /* Decrement count associated with WORD by VALUE, if WORD exists in the database. */ void db_decrement(void *vhandle, char *word, long value){ long n; n = db_getvalue(vhandle, word); if (n > value) n -= value; else n = 0; db_setvalue(vhandle, word, n); } /* Get the number of messages associated with database. */ long db_getcount(void *vhandle){ return db_getvalue(vhandle, MSG_COUNT_TOK); } /* Set the number of messages associated with database. */ void db_setcount(void *vhandle, long count){ db_setvalue(vhandle, MSG_COUNT_TOK, count); } /* Close files and clean up. */ void db_close(void *vhandle){ dbh_t *handle = vhandle; if (handle == NULL) return; handle->dbp->close(handle->dbp, 0); dbh_free(handle); } /* flush any data in memory to disk */ void db_flush(void *vhandle){ dbh_t *handle = vhandle; handle->dbp->sync(handle->dbp, 0); } /* implements locking. */ static int db_lock(dbh_t *handle, int cmd, int type){ int fd; int ret; struct flock lock; if ( (ret = handle->dbp->fd(handle->dbp, &fd)) != 0){ handle->dbp->err (handle->dbp, ret, ERRSTR("db_lock:")); exit(2); } lock.l_type = type; lock.l_start = 0; lock.l_whence = SEEK_END; lock.l_len = 0; return (fcntl(fd, cmd, &lock)); } /* Acquires blocking read lock on database. */ void db_lock_reader(void *vhandle){ dbh_t *handle = vhandle; if (verbose > 1) fprintf(stderr, "[%ld] Acquiring read lock on %s\n",handle->pid, handle->filename); if (db_lock(handle, F_SETLKW, F_RDLCK) != 0){ fprintf(stderr, "[%ld] Error acquiring read lock on %s\n",handle->pid, handle->filename); exit(2); } if (verbose > 1) fprintf(stderr, "[%ld] Got read lock on %s\n",handle->pid, handle->filename); } /* Acquires blocking write lock on database. 
*/ void db_lock_writer(void *vhandle){ dbh_t *handle = vhandle; if (verbose > 1) fprintf(stderr, "[%ld] Acquiring write lock on %s\n",handle->pid, handle->filename); if (db_lock(handle, F_SETLKW, F_WRLCK) != 0){ fprintf(stderr, "[%ld] Error acquiring write lock on %s\n",handle->pid, handle->filename); exit(2); } if (verbose > 1) fprintf(stderr, "[%ld] Got write lock on %s\n",handle->pid, handle->filename); } /* Releases acquired lock */ void db_lock_release(void *vhandle){ dbh_t *handle = vhandle; if (verbose > 1) fprintf(stderr, "[%ld] Releasing lock on %s\n",handle->pid, handle->filename); if (db_lock(handle, F_SETLK, F_UNLCK) != 0){ fprintf(stderr, "[%ld] Error releasing on %s\n", handle->pid, handle->filename); exit(2); } } --- NEW FILE: datastore_db.h --- #ifndef DATASTORE_DB3_H_GUARD #define DATASTORE_DB3_H_GUARD #define MSG_COUNT_TOK ".MSG_COUNT" typedef struct { char *filename; char *name; DB *dbp; pid_t pid; } dbh_t; extern int verbose; #endif --- NEW FILE: datastore.h --- /* API for bogofilter datastore. The idea here is to make bogofilter independent of the database system used to store words. The interface specified by this file determines the entire interaction between bogofilter and the database Writing a new database backend merely requires the implementation of the interface Author: Gyepi Sam <gy...@pr...> */ #ifndef DATASTORE_H_GUARD #define DATASTORE_H_GUARD /* Initialize database, open files, etc. params: char * path to database file, char * name of database returns: opaque pointer to database handle, which must be saved and passed as the first parameter in all subsequent database function calls. */ void *db_open(char *, char *); /* Close files and clean up. */ void db_close(void *); /* Flush pending writes to disk */ void db_flush(void *); /* Increments count for given word. Note: negative results are set to zero, */ void db_increment(void *, char *, long); /* Decrement count for a given word, if it exists in the datastore. 
Note: negative results are set to zero. */ void db_decrement(void *, char *, long); /* Retrieve the value associated with a given word in a list. Returns zero if the word does not exist in the database. */ long db_getvalue(void *, char *); /* Set the value associated with a given word in a list. */ void db_setvalue(void *, char *, long); /* Get the database message count. */ long db_getcount(void*); /* Set the database message count. */ void db_setcount(void*, long); /* Acquire a reader lock on database. Caller is blocked until the lock is granted. Multiple readers can have simultaneous locks. When a writer lock exists, reader lock requests are blocked until the lock becomes available. */ void db_lock_reader(void *); /* Acquire a writer lock on database. Caller is blocked until the lock is granted. Only one writer lock can exist at any time. Writer lock requests are blocked until all reader locks are released. */ void db_lock_writer(void *); /* Release acquired lock. */ void db_lock_release(void *); #endif --- NEW FILE: xmalloc.c --- /* * NAME: * xmalloc.c -- front-end to standard heap manipulation routines, with error checking. 
* * AUTHOR: * Gyepi Sam <gy...@pr...> * */ #include <stdio.h> #include <stdlib.h> #include "xmalloc.h" #define mem_error(a) do { fprintf(stderr, a ": Out of memory\n"); abort(); } while(0) void * xmalloc(size_t size){ char *x; if (!(x = malloc(size))) mem_error("xmalloc"); return (void *)x; } void xfree(void *ptr){ if (ptr) free(ptr); } void *xcalloc(size_t nmemb, size_t size){ char *x; if (!(x = calloc(nmemb, size))) mem_error("xcalloc"); return (void *)x; } void *xrealloc(void *ptr, size_t size){ char *x; if (!(x = realloc(ptr, size))) mem_error("xrealloc"); return(void *)x; } --- NEW FILE: xmalloc.h --- #ifndef XMALLOC_H_GUARD #define XMALLOC_H_GUARD #include <stdlib.h> void *xmalloc(size_t size); void xfree(void *ptr); void *xcalloc(size_t nmemb, size_t size); void *xrealloc(void *ptr, size_t size); #endif --- NEW FILE: bogoutil.c --- /***************************************************************************** NAME: bogoutil.c -- dumps and loads bogofilter text files from/to Berkeley DB format. 
AUTHOR: Gyepi Sam <gy...@pr...> ******************************************************************************/ #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <db.h> #include "datastore.h" #include "datastore_db.h" #define PROGNAME "bogoutil" #define VERSION "0.1" int verbose = 0; int dump_file (char *file) { int ret; DB db; DB *dbp; DBC dbc; DBC *dbcp; DBT key, data; dbp = &db; dbcp = &dbc; if ((ret = db_create (&dbp, NULL, 0)) != 0) { fprintf (stderr, PROGNAME " (db_create): %s\n", db_strerror (ret)); return 1; } if ((ret = dbp->open (dbp, file, NULL, DB_BTREE, 0, 0)) != 0) { dbp->err (dbp, ret, PROGNAME " (open): %s", file); return 1; } if ((ret = dbp->cursor (dbp, NULL, &dbcp, 0) != 0)) { dbp->err (dbp, ret, PROGNAME " (cursor): %s", file); return 1; } memset (&key, 0, sizeof (DBT)); memset (&data, 0, sizeof (DBT)); for (;;) { ret = dbcp->c_get (dbcp, &key, &data, DB_NEXT); if (ret == 0) { printf ("%.*s %lu\n", key.size, (char *) key.data, *(unsigned long *) data.data); } else if (ret == DB_NOTFOUND) { break; } else { dbp->err (dbp, ret, PROGNAME " (c_get)"); break; } } return 0; } #define BUFSIZE 512 int load_file(char *db_file){ dbh_t *dbh; char buf[BUFSIZE]; char *p; int rv = 0; size_t len; long line = 0; long count; if ( (dbh = db_open(db_file, db_file)) == NULL) return 2; memset(buf, '\0', BUFSIZE); for (;;) { if (fgets(buf, BUFSIZE, stdin) == NULL){ if (ferror(stdin)){ perror(PROGNAME); rv = 2; } break; } line++; len = strlen(buf); /* too short.*/ if (len < 4) continue; p = &buf[len - 1]; if (*(p--) != '\n'){ fprintf(stderr, PROGNAME ": Unexpected input [%s] on line %lu. Does not end with newline\n", buf, line); rv = 1; break; } while(isdigit(*p)) p--; if (!isspace(*p)){ fprintf(stderr, PROGNAME ": Unexpected input [%s] on line %lu. 
Expecting space before count\n", buf, line); rv = 1; break; } count = atol(p + 1); while(isspace(*p)) p--; *(p+1) = '\0'; /* Slower, but allows multiple lists to be concatenated */ db_increment(dbh, buf, count); } db_close(dbh); return rv; } void version (void) { fprintf (stderr, PROGNAME ": version: " VERSION "\n"); exit (0); } void usage (void) { fprintf (stderr, "Usage: %s -d | -l [ -V ] [ -h ] file.db\n", PROGNAME); exit (1); } void help (void) { fprintf (stderr, "Help: %s\n", PROGNAME); fprintf (stderr, "Read UPGRADE for help\n"); exit (0); } void ensure_uniq_flag (int flag) { if (flag != 0) { fprintf (stderr, PROGNAME ": Flags -d and -l are mutually exclusive.\n"); exit (2); } } int main (int argc, char *argv[]) { enum { DUMP = 1, LOAD = 2 }; char *db_file; int ch; int flag = 0; while ((ch = getopt (argc, argv, "dlhV")) != -1) switch (ch) { case 'd': ensure_uniq_flag (flag); flag = DUMP; break; case 'l': ensure_uniq_flag (flag); flag = LOAD; break; case 'h': help (); /* unreachable */ break; case 'V': version (); /* unreachable */ break; default: usage (); /* unreachable */ break; } /* Extra or missing parameters */ if ((argc - optind) != 1){ fprintf (stderr, PROGNAME ": Exacly one filename argument is expected.\n"); exit (2); } db_file = argv[optind++]; if (flag == DUMP) { return dump_file (db_file); } else if (flag == LOAD) { return load_file (db_file); } else { fprintf (stderr, PROGNAME ": One of the flags -d or -l is required\n"); exit (2); } } Index: Makefile.am =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/Makefile.am,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Makefile.am 18 Sep 2002 23:33:48 -0000 1.9 --- Makefile.am 24 Sep 2002 04:34:19 -0000 1.10 *************** *** 1,4 **** --- 1,21 ---- # $Id$ # $Log$ + # Revision 1.10 2002/09/24 04:34:19 gyepi + # + # + # Modified Files: + # Makefile.am -- add entries for datastore* + and other new files + # 
bogofilter.c bogofilter.h main.c -- fixup to use database abstraction + # + # Added Files: + # datastore_db.c datastore_db.h datastore.h -- database abstraction. Also implements locking + # xmalloc.c xmalloc.h -- utility + # bogoutil.c -- dump/restore utility. + # + # 1. Implements database abstraction as discussed. + # Also implements multiple readers/single writer file locking. + # + # 2. Adds utility to dump/restore databases. + # # Revision 1.9 2002/09/18 23:33:48 relson # Added lexer.h to bogofilter_SOURCES list. *************** *** 40,52 **** # what to build ! bin_PROGRAMS = bogofilter noinst_PROGRAMS = lexertest man1_MANS = bogofilter.1 # what to build that from ! bogofilter_SOURCES = bogofilter.c bogofilter.h lock.c lock.h main.c lexer_l.l lexer.h lexertest_SOURCES = lexer_l.l lexertest_CFLAGS = -DMAIN # what to distribute --- 57,72 ---- # what to build ! bin_PROGRAMS = bogofilter bogoutil noinst_PROGRAMS = lexertest man1_MANS = bogofilter.1 # what to build that from ! bogofilter_SOURCES = bogofilter.c bogofilter.h lock.c lock.h main.c lexer_l.l lexer.h \ ! datastore.h datastore_db.h datastore_db.c xmalloc.h xmalloc.c lexertest_SOURCES = lexer_l.l lexertest_CFLAGS = -DMAIN + + bogoutil_SOURCES = bogoutil.c datastore.h datastore_db.h datastore_db.c xmalloc.h xmalloc.c # what to distribute Index: bogofilter.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/bogofilter.c,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** bogofilter.c 23 Sep 2002 11:35:51 -0000 1.15 --- bogofilter.c 24 Sep 2002 04:34:19 -0000 1.16 *************** *** 2,5 **** --- 2,22 ---- /* * $Log$ + * Revision 1.16 2002/09/24 04:34:19 gyepi + * + * + * Modified Files: + * Makefile.am -- add entries for datastore* + and other new files + * bogofilter.c bogofilter.h main.c -- fixup to use database abstraction + * + * Added Files: + * datastore_db.c datastore_db.h datastore.h -- database abstraction. 
Also implements locking + * xmalloc.c xmalloc.h -- utility + * bogoutil.c -- dump/restore utility. + * + * 1. Implements database abstraction as discussed. + * Also implements multiple readers/single writer file locking. + * + * 2. Adds utility to dump/restore databases. + * * Revision 1.15 2002/09/23 11:35:51 m-a * Fix GCC 3.2 warnings. *************** *** 109,356 **** #define min(x, y) (((x) < (y)) ? (x) : (y)) ! wordlist_t ham_list = {"ham", NULL, 0, NULL, NULL}; ! wordlist_t spam_list = {"spam", NULL, 0, NULL, NULL}; #define PLURAL(count) ((count == 1) ? "" : "s") ! #define DBT_init(dbt) do { memset(&dbt, 0, sizeof(DBT)); } while(0) ! #define char2DBT(dbt,ptr) do { dbt.data = ptr; dbt.size = strlen(ptr); } while(0) ! ! #define x2DBT(dbt,val,type) do { dbt.data = &val; dbt.size = sizeof(type); } while(0) ! ! #define long2DBT(dbt,val) x2DBT(dbt,val,long) ! #define int2DBT(dbt,val) x2DBT(dbt,val,int) ! ! long get_word_value(char *word, wordlist_t *list) ! { ! DB *dbp; ! DBT key; ! DBT data; ! int ret; ! ! DBT_init(key); ! DBT_init(data); ! ! char2DBT(key, word); ! ! dbp = list->db; ! ! if ((ret = dbp->get(dbp, NULL, &key, &data, 0)) == 0){ ! return(*(long *)data.data); ! } ! else if (ret == DB_NOTFOUND){ ! return(0); ! } ! else { ! dbp->err (dbp, ret, "bogofilter (get_word_value): %s", word); ! exit(2); ! } ! } ! ! void set_word_value(char *word, long value, wordlist_t *list) ! { ! DB *dbp; ! DBT key; ! DBT data; ! int ret; ! ! DBT_init(key); ! DBT_init(data); ! ! char2DBT(key, word); ! long2DBT(data, value); ! ! dbp = list->db; ! ! if ((ret = dbp->put(dbp, NULL, &key, &data,0)) == 0){ ! if (verbose >= 3) ! (void) printf("\"%s\" stored %ld time%s\n", word, value, PLURAL(value)); ! } ! else ! { ! dbp->err (dbp, ret, "bogofilter (set_word_value): %s", word); ! exit(2); ! } ! } ! ! static void increment(char *word, long incr, wordlist_t *list) ! /* increment a word usage count in the specified list */ ! { ! long count = get_word_value(word, list) + incr; ! 
count = max(count, 0); ! ! set_word_value(word, count, list); ! ! if (verbose >= 1) { ! printf("increment: '%s' has %lu hits\n",word,count); ! } ! } ! ! static int getcount(char *word, wordlist_t *list) ! /* get the count associated with a given word in a list */ ! { ! long value = get_word_value(word, list); ! ! if (value){ ! if (verbose >= 2) ! printf("getcount: '%s' has %ld %s hits in %ld\n", word, value, list->name, list->msgcount); ! } ! else { ! if (verbose >= 3) ! printf("getcount: no %s hits for %s\n", list->name, word); ! } ! ! return value; ! } ! ! int read_count(wordlist_t *list) ! /* Reads count of emails, if any. */ ! { ! FILE *infp; ! ! list->msgcount = 0; ! ! infp = fopen(list->count_file, "r"); /* Open file for reading */ ! ! if (infp == NULL) ! return 1; ! ! lock_fileno(fileno(infp), LOCK_SH); /* Lock the fole before reading */ ! fscanf(infp, HEADER, &list->msgcount); /* Read contents from the file */ ! unlock_fileno(fileno(infp)); /* Release the lock */ ! fclose(infp); ! return 0; ! } ! ! ! void write_count(wordlist_t *list) ! /* dump the count of emails to a specified file */ ! { ! FILE *outfp; ! ! outfp = fopen(list->count_file, "a+"); /* First open for append */ ! ! if (outfp == NULL) ! { ! fprintf(stderr, "bogofilter (write_count): cannot open file %s. %m", ! list->count_file); ! exit(2); ! } ! ! /* Lock file before modifying it to avoid a race condition with other ! * bogofilter instances that may want to read/modify this file */ ! lock_fileno(fileno(outfp), LOCK_EX); /* Lock the file for writing */ ! freopen(list->count_file, "w", outfp); /* Empty the file, ready to write */ ! (void) fprintf(outfp, HEADER, list->msgcount); ! unlock_fileno(fileno(outfp)); /* Unlock the file */ ! fclose(outfp); ! ! } ! ! ! int read_list(wordlist_t *list) ! /* initialize database */ ! /* return 0 if successful, and 1 if it was unsuccessful. */ ! { ! int ret; ! int fdp; /* for holding the value of the db file descriptor */ ! DB *dbp; ! 
list->file = strdup(list->file); ! ! dbp = malloc(sizeof(DB)); ! ! if (dbp == NULL){ ! fprintf(stderr, "bogofilter (readlist): out of memory\n"); ! return 1; ! } ! ! if ((ret = db_create (&dbp, NULL, 0)) != 0){ ! fprintf (stderr, "bogofilter (db_create): %s\n", db_strerror (ret)); ! return 1; ! } ! ! /* Lock the database file */ ! if(dbp->fd(dbp, &fdp) == 0) { /* Get file descriptor to lock */ ! if(lock_fileno(fdp,LOCK_SH) != 0) { /* Get a shared lock */ ! return(1); /* Lock attempt failed */ ! } ! } ! ! if ((ret = dbp->open (dbp, list->file, NULL, DB_BTREE, DB_CREATE, 0664)) != 0){ ! dbp->err (dbp, ret, "open: %s", list->file); ! return 1; ! } ! ! list->db = dbp; ! read_count(list); ! ! return 0; ! } ! ! void write_list(wordlist_t *list) ! /* close database */ ! { ! int fdp; /* for holding the value of the db file descriptor */ ! DB *db = list->db; ! ! write_count(list); ! ! /* Unock the database file */ ! if(db->fd(db, &fdp) == 0) { /* Get file descriptor to unlock */ ! unlock_fileno(fdp); /* Release lock */ ! } ! ! db->close(db, 0); ! } ! ! int bogodump(char *file) ! /* dump state of database */ { ! int ret; ! DB db; ! DB *dbp; ! DBC dbc; ! DBC *dbcp; ! DBT key, data; ! ! dbp = &db; ! dbcp = &dbc; ! ! if ((ret = db_create (&dbp, NULL, 0)) != 0) ! { ! fprintf (stderr, "bogodump (db_create): %s\n", db_strerror (ret)); ! return 1; ! } ! ! if ((ret = dbp->open (dbp, file, NULL, DB_BTREE, 0, 0)) != 0) ! { ! dbp->err (dbp, ret, "bogodump (open): %s", file); ! return 1; ! } ! ! if ((ret = dbp->cursor (dbp, NULL, &dbcp, 0) != 0)) ! { ! dbp->err (dbp, ret, "bogodump (cursor): %s", file); ! return 1; ! } ! ! memset (&key, 0, sizeof (DBT)); ! memset (&data, 0, sizeof (DBT)); ! ! for (;;) ! { ! ret = dbcp->c_get (dbcp, &key, &data, DB_NEXT); ! if (ret == 0){ ! printf ("%.*s:%lu\n",key.size, (char *)key.data, *(unsigned long *)data.data); ! } ! else if (ret == DB_NOTFOUND){ ! break; ! } ! else { ! dbp->err (dbp, ret, "bogodump (c_get)"); ! break; ! } ! } ! 
return 0; } --- 126,139 ---- #define min(x, y) (((x) < (y)) ? (x) : (y)) ! wordlist_t ham_list = {"ham", NULL, 0, NULL}; ! wordlist_t spam_list = {"spam", NULL, 0, NULL}; #define PLURAL(count) ((count == 1) ? "" : "s") ! static void strlwr(char* s) { ! char c; ! while((c = *s) != 0) ! *s++ = tolower(c); } *************** *** 366,372 **** char tokenbuffer[BUFSIZ]; // Grab tokens from the lexical analyzer into our own private Judy array yyin = fdopen(fdin, "r"); ! wordcount = msgcount = 0; for (;;) { --- 149,164 ---- char tokenbuffer[BUFSIZ]; + //FIXME -- The database locking time can be minized by using a hash table. + db_lock_writer(list->dbh); + if (other) + db_lock_writer(other->dbh); + // Grab tokens from the lexical analyzer into our own private Judy array yyin = fdopen(fdin, "r"); ! msgcount = wordcount = 0; ! ! list->msgcount = db_getcount(list->dbh); ! if (other) other->msgcount = db_getcount(other->dbh); ! for (;;) { *************** *** 387,391 **** { list->msgcount++; ! msgcount++; if (other && other->msgcount > 0) other->msgcount--; --- 179,183 ---- { list->msgcount++; ! msgcount++; if (other && other->msgcount > 0) other->msgcount--; *************** *** 404,410 **** freq = MAX_REPEATS; ! increment(tokenbuffer, freq, list); if (other) ! increment(tokenbuffer, -freq, other); } JudySLFreeArray(&PArray, &JError); --- 196,202 ---- freq = MAX_REPEATS; ! db_increment(list->dbh, tokenbuffer, freq); if (other) ! 
db_increment(other->dbh, tokenbuffer, -freq); } JudySLFreeArray(&PArray, &JError); *************** *** 419,422 **** --- 211,230 ---- } } + + db_setcount(list->dbh, list->msgcount); + db_flush(list->dbh); + if (verbose) + fprintf(stderr, "bogofilter: %lu messages on the %s list\n", list->msgcount, list->name); + + if (other){ + db_setcount(other->dbh, other->msgcount); + if (verbose) + fprintf(stderr, "bogofilter: %lu messages on the %s list\n", other->msgcount, other->name); + + db_flush(other->dbh); + db_lock_release(other->dbh); + } + + db_lock_release(list->dbh); } *************** *** 484,490 **** { double prob, hamness, spamness; ! ! hamness = getcount(token, &ham_list); ! spamness = getcount(token, &spam_list); #ifdef NON_EQUIPROBABLE --- 292,298 ---- { double prob, hamness, spamness; ! ! hamness = db_getvalue(ham_list.dbh, token); ! spamness = db_getvalue(spam_list.dbh, token); #ifdef NON_EQUIPROBABLE *************** *** 544,548 **** pp->key[0] = '\0'; } ! for (loc = JudySLFirst(PArray, tokenbuffer, 0); loc != (void *) NULL; --- 352,356 ---- pp->key[0] = '\0'; } ! for (loc = JudySLFirst(PArray, tokenbuffer, 0); loc != (void *) NULL; *************** *** 600,608 **** spamicity = product / (product + invproduct); if (verbose>1) ! printf("# %f %f %s\n", pp->prob, spamicity, pp->key); } if (verbose) ! printf("# Spamicity of %f\n", spamicity); return spamicity; --- 408,416 ---- spamicity = product / (product + invproduct); if (verbose>1) ! fprintf(stderr, "# %f %f %s\n", pp->prob, spamicity, pp->key); } if (verbose) ! fprintf(stderr, "# Spamicity of %f\n", spamicity); return spamicity; *************** *** 619,623 **** // tokenize input text and save words in a Judy array. PArray = collect_words(fd); ! // select the best spam/nonspam indicators. stats = select_indicators(PArray); --- 427,437 ---- // tokenize input text and save words in a Judy array. PArray = collect_words(fd); ! ! db_lock_reader(ham_list.dbh); ! db_lock_reader(spam_list.dbh); ! ! 
ham_list.msgcount = db_getcount(ham_list.dbh); ! spam_list.msgcount = db_getcount(spam_list.dbh); ! // select the best spam/nonspam indicators. stats = select_indicators(PArray); *************** *** 625,628 **** --- 439,445 ---- // computes the spamicity of the spam/nonspam indicators. spamicity = compute_spamicity(stats); + + db_lock_release(spam_list.dbh); + db_lock_release(ham_list.dbh); status = (spamicity > SPAM_CUTOFF) ? RC_SPAM : RC_NONSPAM; Index: bogofilter.h =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/bogofilter.h,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** bogofilter.h 23 Sep 2002 11:31:53 -0000 1.5 --- bogofilter.h 24 Sep 2002 04:34:19 -0000 1.6 *************** *** 2,5 **** --- 2,22 ---- /* * $Log$ + * Revision 1.6 2002/09/24 04:34:19 gyepi + * + * + * Modified Files: + * Makefile.am -- add entries for datastore* + and other new files + * bogofilter.c bogofilter.h main.c -- fixup to use database abstraction + * + * Added Files: + * datastore_db.c datastore_db.h datastore.h -- database abstraction. Also implements locking + * xmalloc.c xmalloc.h -- utility + * bogoutil.c -- dump/restore utility. + * + * 1. Implements database abstraction as discussed. + * Also implements multiple readers/single writer file locking. + * + * 2. Adds utility to dump/restore databases. + * * Revision 1.5 2002/09/23 11:31:53 m-a * Unnest comments, and move $ line down by one to prevent CVS from adding nested comments again. *************** *** 29,36 **** { char *name; // resource name (for debug/verbose messages) ! void *db; // database handle unsigned long msgcount; // count of messages in wordlist. char *file; // associated file - char *count_file; // file for counting emails } wordlist_t; --- 46,52 ---- { char *name; // resource name (for debug/verbose messages) ! void *dbh; // database handle unsigned long msgcount; // count of messages in wordlist. 
char *file; // associated file } wordlist_t; Index: main.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/main.c,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** main.c 23 Sep 2002 11:34:30 -0000 1.10 --- main.c 24 Sep 2002 04:34:19 -0000 1.11 *************** *** 2,5 **** --- 2,22 ---- /* * $Log$ + * Revision 1.11 2002/09/24 04:34:19 gyepi + * + * + * Modified Files: + * Makefile.am -- add entries for datastore* + and other new files + * bogofilter.c bogofilter.h main.c -- fixup to use database abstraction + * + * Added Files: + * datastore_db.c datastore_db.h datastore.h -- database abstraction. Also implements locking + * xmalloc.c xmalloc.h -- utility + * bogoutil.c -- dump/restore utility. + * + * 1. Implements database abstraction as discussed. + * Also implements multiple readers/single writer file locking. + * + * 2. Adds utility to dump/restore databases. + * * Revision 1.10 2002/09/23 11:34:30 relson * Modify passthrough code so that X-Spam-Status line will also print in verbose mode. *************** *** 60,63 **** --- 77,81 ---- #endif #include "bogofilter.h" + #include "datastore.h" #define BOGODIR "/.bogofilter/" *************** *** 65,83 **** #define SPAMFILE "spamlist.db" - #define HAMCOUNTFILE "hamlist.count" - #define SPAMCOUNTFILE "spamlist.count" - int verbose, passthrough; int main(int argc, char **argv) { ! int ch, dump = 0; int register_spam = 0, register_ham = 0; int spam_to_ham = 0, ham_to_spam = 0; char hamfile[PATH_MAX], spamfile[PATH_MAX], directory[PATH_MAX]; - char hamcountfile[PATH_MAX], spamcountfile[PATH_MAX]; char *tmp; struct stat sb; ! int readerror=0; if ( (tmp = getenv("HOME")) != NULL ) { --- 83,97 ---- #define SPAMFILE "spamlist.db" int verbose, passthrough; int main(int argc, char **argv) { ! 
int ch; int register_spam = 0, register_ham = 0; int spam_to_ham = 0, ham_to_spam = 0; char hamfile[PATH_MAX], spamfile[PATH_MAX], directory[PATH_MAX]; char *tmp; struct stat sb; ! int exitcode = 0; if ( (tmp = getenv("HOME")) != NULL ) { *************** *** 86,90 **** strcat(directory, BOGODIR); ! while ((ch = getopt(argc, argv, "d:shSHvVpl")) != EOF) switch(ch) { --- 100,104 ---- strcat(directory, BOGODIR); ! while ((ch = getopt(argc, argv, "d:shSHvVp")) != EOF) switch(ch) { *************** *** 128,135 **** passthrough = 1; break; - - case 'l': - dump = 1; - break; } --- 142,145 ---- *************** *** 151,203 **** spam_list.file = spamfile; - strcpy(hamcountfile, directory); - strcat(hamcountfile, HAMCOUNTFILE); - ham_list.count_file = hamcountfile; - - strcpy(spamcountfile, directory); - strcat(spamcountfile, SPAMCOUNTFILE); - spam_list.count_file = spamcountfile; - - readerror += read_list(&ham_list); - readerror += read_list(&spam_list); ! /* readerror is ok, but only if using register_* modes */ ! if (readerror && ! (register_spam || register_ham)) { ! fprintf(stderr, "Error: can't open list file(s).\n"); ! exit(2); } ! ! if (dump) ! { ! if (register_ham) ! bogodump(hamfile); ! else if (register_spam) ! bogodump(spamfile); } ! else if (register_spam) { register_words(STDIN_FILENO, &spam_list, NULL); - write_list(&spam_list); - if (verbose) - printf("bogofilter: %lu messages on the spam list\n", spam_list.msgcount); } else if (register_ham) { register_words(STDIN_FILENO, &ham_list, NULL); - write_list(&ham_list); - if (verbose) - printf("bogofilter: %lu messages on the ham list\n", ham_list.msgcount); } else if (spam_to_ham) { register_words(STDIN_FILENO, &ham_list, &spam_list); - write_list(&ham_list); - write_list(&spam_list); } else if (ham_to_spam) { register_words(STDIN_FILENO, &spam_list, &ham_list); - write_list(&ham_list); - write_list(&spam_list); } else --- 161,192 ---- spam_list.file = spamfile; ! 
if ( (ham_list.dbh = db_open(ham_list.file, ham_list.name)) == NULL){ ! fprintf(stderr, "bogofilter: Cannot initialize database %s.\n", ham_list.name); ! exit(2); } ! ! if ( (spam_list.dbh = db_open(spam_list.file, spam_list.name)) == NULL){ ! fprintf(stderr, "bogofilter: Cannot initialize database %s.\n", spam_list.name); ! db_close(ham_list.dbh); ! exit(2); } ! ! ! if (register_spam) { register_words(STDIN_FILENO, &spam_list, NULL); } else if (register_ham) { register_words(STDIN_FILENO, &ham_list, NULL); } else if (spam_to_ham) { register_words(STDIN_FILENO, &ham_list, &spam_list); } else if (ham_to_spam) { register_words(STDIN_FILENO, &spam_list, &ham_list); } else *************** *** 238,245 **** } } ! exit(status); } ! exit(0); } --- 227,238 ---- } } ! ! exitcode = status; } ! db_close(spam_list.dbh); ! db_close(ham_list.dbh); ! ! exit(exitcode); } |