[cvs] bogofilter/src bogoutil.c,1.49.6.3,1.49.6.4 datastore.c,1.3.12.3,1.3.12.4 datastore_db.c,1.37.4.3,1.37.4.4 datastore_db.h,1.1.2.2,1.1.2.3
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <re...@us...> - 2003-08-07 22:51:52
|
Update of /cvsroot/bogofilter/bogofilter/src In directory sc8-pr-cvs1:/tmp/cvs-serv25586 Modified Files: Tag: DatastoreAPI bogoutil.c datastore.c datastore_db.c datastore_db.h Log Message: Define and use datastore handle type (dsh_t). Index: bogoutil.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/bogoutil.c,v retrieving revision 1.49.6.3 retrieving revision 1.49.6.4 diff -u -d -r1.49.6.3 -r1.49.6.4 --- bogoutil.c 7 Aug 2003 22:48:44 -0000 1.49.6.3 +++ bogoutil.c 7 Aug 2003 22:51:48 -0000 1.49.6.4 @@ -465,7 +465,7 @@ rh.sum = &sum; rh.count = &tok_cnt; - ds_foreach(dbh, robx_hook, &rh); + ds_foreach(dbh, robx_hook, &rh); robx = sum/tok_cnt; if (verbose) @@ -509,6 +509,8 @@ fprintf(stderr, "%s: string too long creating .db file name.\n", PROGNAME); exit(EX_ERROR); } + + run_type = REG_SPAM; dbh = ds_open(".", count, (const char **) filepaths, DB_WRITE); if (dbh == NULL) Index: datastore.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/datastore.c,v retrieving revision 1.3.12.3 retrieving revision 1.3.12.4 diff -u -d -r1.3.12.3 -r1.3.12.4 --- datastore.c 7 Aug 2003 15:57:50 -0000 1.3.12.3 +++ datastore.c 7 Aug 2003 22:51:48 -0000 1.3.12.4 @@ -49,36 +49,58 @@ /* Function definitions */ -static void convert_external_to_internal(dbv_t *ex_data, dsv_t *in_data, bool is_swapped) +static void convert_external_to_internal(dsh_t *dsh, dbv_t *ex_data, dsv_t *in_data) { size_t i = 0; uint32_t *cv = ex_data->data; - in_data->spamcount = !is_swapped ? cv[i++] : swap_32bit(cv[i++]); + if (dsh->count == 1) { + in_data->spamcount = !dsh->is_swapped ? cv[i++] : swap_32bit(cv[i++]); - if (ex_data->leng <= i * sizeof(uint32_t)) - in_data->goodcount = 0; - else - in_data->goodcount = !is_swapped ? cv[i++] : swap_32bit(cv[i++]); + if (ex_data->leng <= i * sizeof(uint32_t)) + in_data->goodcount = 0; + else + in_data->goodcount = !dsh->is_swapped ? 
cv[i++] : swap_32bit(cv[i++]); - if (ex_data->leng <= i * sizeof(uint32_t)) - in_data->date = 0; + if (ex_data->leng <= i * sizeof(uint32_t)) + in_data->date = 0; + else { + in_data->date = !dsh->is_swapped ? cv[i++] : swap_32bit(cv[i++]); + } + } else { - in_data->date = !is_swapped ? cv[i++] : swap_32bit(cv[i++]); + in_data->count[dsh->index] = !dsh->is_swapped ? cv[i++] : swap_32bit(cv[i++]); + + if (ex_data->leng <= i * sizeof(uint32_t)) + in_data->date = 0; + else { + YYYYMMDD date = !dsh->is_swapped ? cv[i++] : swap_32bit(cv[i++]); + in_data->date = max(in_data->date, date); + } } return; } -static void convert_internal_to_external(dsv_t *in_data, dbv_t *ex_data, bool is_swapped) +static void convert_internal_to_external(dsh_t *dsh, dsv_t *in_data, dbv_t *ex_data) { size_t i = 0; uint32_t *cv = ex_data->data; - cv[i++] = !is_swapped ? in_data->spamcount : swap_32bit(in_data->spamcount); - cv[i++] = !is_swapped ? in_data->goodcount : swap_32bit(in_data->goodcount); + /* Writing requires extra magic since the counts may need to be + ** separated for output to different wordlists. + */ + + if (dsh->count == 1) { + cv[i++] = !dsh->is_swapped ? in_data->spamcount : swap_32bit(in_data->spamcount); + cv[i++] = !dsh->is_swapped ? in_data->goodcount : swap_32bit(in_data->goodcount); + } + else { + cv[i++] = !dsh->is_swapped ? in_data->count[dsh->index] : swap_32bit(in_data->count[dsh->index]); + } + if (datestamp_tokens || in_data->date != 0) - cv[i++] = !is_swapped ? in_data->date : swap_32bit(in_data->date); + cv[i++] = !dsh->is_swapped ? 
in_data->date : swap_32bit(in_data->date); ex_data->leng = i * sizeof(cv[0]); @@ -93,8 +115,8 @@ { dsh_t *val = xmalloc(sizeof(*val)); val->dbh = dbh; - val->count = count; val->index = 0; + val->count = count; val->is_swapped = is_swapped; return val; } @@ -130,7 +152,7 @@ int ds_read(void *vhandle, const word_t *word, /*@out@*/ dsv_t *val) { dsh_t *dsh = vhandle; - int ret; + bool found = false; dbv_t ex_key; dbv_t ex_data; uint32_t cv[3]; @@ -144,29 +166,47 @@ ex_data.data = cv; ex_data.leng = ex_data.size = sizeof(cv); - ret = db_get_dbvalue(dsh, &ex_key, &ex_data); + memset(val, 0, sizeof(*val)); - if (ret == 0) { - convert_external_to_internal(&ex_data, val, dsh->is_swapped); + for (dsh->index = 0; dsh->index < dsh->count; dsh->index += 1) { + int ret = db_get_dbvalue(dsh, &ex_key, &ex_data); - if (DEBUG_DATABASE(3)) { - fprintf(dbgout, "ds_read: [%*s] -- %lu,%lu\n", - word->leng, word->text, - (unsigned long)val->spamcount, - (unsigned long)val->goodcount); + switch (ret) { + case 0: + found = true; + + convert_external_to_internal(dsh, &ex_data, val); + + if (DEBUG_DATABASE(3)) { + fprintf(dbgout, "ds_read: [%*s] -- %lu,%lu\n", + word->leng, word->text, + (unsigned long)val->spamcount, + (unsigned long)val->goodcount); + } + break; + + case DB_NOTFOUND: + if (DEBUG_DATABASE(3)) { + fprintf(dbgout, "db_get_dbvalue: [%*s] not found\n", + word->leng, (char *) word->text); + } + break; + + default: + print_error(__FILE__, __LINE__, "(db) db_get_dbvalue( '%*s' ), err: %d, %s", + word->leng, (char *) word->text, ret, db_strerror(ret)); + exit(EX_ERROR); } - } else { - memset(val, 0, sizeof(*val)); } - return ret; + return found ? 
0 : 1; } int ds_write(void *vhandle, const word_t *word, dsv_t *val) { + int ret; dsh_t *dsh = vhandle; - bool ok; dbv_t ex_key; dbv_t ex_data; uint32_t cv[3]; @@ -184,20 +224,37 @@ if (datestamp_tokens || today != 0) val->date = today; - convert_internal_to_external(val, &ex_data, dsh->is_swapped); + for (dsh->index = 0; dsh->index < dsh->count; dsh->index += 1) { - ok = db_set_dbvalue(dsh, &ex_key, &ex_data); + /* With two wordlists, it's necessary to check index and + ** run_type to avoid writing all tokens to both lists. */ + if (dsh->count == 2) { + bool ok; + /* if index is spamlist, but not writing to spamlist ... */ + /* if index is goodlist, but not writing to goodlist ... */ + ok = ((dsh->index == SPAM && (run_type & (REG_SPAM | UNREG_SPAM))) || + (dsh->index == GOOD && (run_type & (REG_GOOD | UNREG_GOOD)))); + if (!ok) + continue; + } - if (ok) { - if (DEBUG_DATABASE(3)) { - fprintf(dbgout, "ds_write: [%*s] -- %lu,%lu\n", - word->leng, word->text, - (unsigned long)val->spamcount, - (unsigned long)val->goodcount); + convert_internal_to_external(dsh, val, &ex_data); + + ret = db_set_dbvalue(dsh, &ex_key, &ex_data); + + if (ret == 0) { + if (DEBUG_DATABASE(3)) { + fprintf(dbgout, "ds_write: [%*s] -- %lu,%lu\n", + word->leng, word->text, + (unsigned long)val->spamcount, + (unsigned long)val->goodcount); + } + } else { + break; } } - return ok ? 
0 : 1; + return ret; } @@ -231,9 +288,13 @@ word_t w_key; dsv_t in_data; ds_userdata_t *ds_data = userdata; + dsh_t *dsh = ds_data->dsh; + w_key.text = ex_key->data; w_key.leng = ex_key->leng; - convert_external_to_internal(ex_data, &in_data, ds_data->dsh->is_swapped); + + convert_external_to_internal(dsh, ex_data, &in_data); + val = (*ds_data->hook)(&w_key, &in_data, ds_data->data); return val; } Index: datastore_db.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/datastore_db.c,v retrieving revision 1.37.4.3 retrieving revision 1.37.4.4 diff -u -d -r1.37.4.3 -r1.37.4.4 --- datastore_db.c 7 Aug 2003 15:57:50 -0000 1.37.4.3 +++ datastore_db.c 7 Aug 2003 22:51:48 -0000 1.37.4.4 @@ -115,9 +115,9 @@ } -void dbh_print_names(dsh_t *dsh, const char *msg) +void dbh_print_names(void *vhandle, const char *msg) { - dbh_t *handle = dsh->dbh; + dbh_t *handle = vhandle; if (handle->count == 1) fprintf(dbgout, "%s (%s)", msg, handle->name[0]); @@ -309,11 +309,11 @@ { int ret; bool found = false; - size_t i; DBT db_key; DBT db_data; dbh_t *handle = dsh->dbh; + DB *dbp = handle->dbp[dsh->index]; db_enforce_locking(handle, "db_get_dbvalue"); @@ -328,29 +328,24 @@ db_data.ulen = val->size; db_data.flags = DB_DBT_USERMEM; /* saves the memcpy */ - for (i = 0; i < handle->count; i += 1) { - DB *dbp = handle->dbp[i]; - ret = dbp->get(dbp, NULL, &db_key, &db_data, 0); + ret = dbp->get(dbp, NULL, &db_key, &db_data, 0); - switch (ret) { - case 0: - found = true; - break; - case DB_NOTFOUND: - if (handle->count != 1) { - uint32_t *count = token->data; - count[i] = 0; - } - if (DEBUG_DATABASE(3)) { - fprintf(dbgout, "db_get_dbvalue: [%*s] not found\n", - token->leng, (char *) token->data); - } - break; - default: - print_error(__FILE__, __LINE__, "(db) db_get_dbvalue( '%*s' ), err: %d, %s", - token->leng, (char *) token->data, ret, db_strerror(ret)); - exit(EX_ERROR); + val->leng = db_data.size; + + switch (ret) { + case 0: + 
found = true; + break; + case DB_NOTFOUND: + if (DEBUG_DATABASE(3)) { + fprintf(dbgout, "db_get_dbvalue: [%*s] not found\n", + token->leng, (char *) token->data); } + break; + default: + print_error(__FILE__, __LINE__, "(db) db_get_dbvalue( '%*s' ), err: %d, %s", + token->leng, (char *) token->data, ret, db_strerror(ret)); + exit(EX_ERROR); } return found ? 0 : DB_NOTFOUND; @@ -359,10 +354,13 @@ int db_set_dbvalue(dsh_t *dsh, const dbv_t *token, dbv_t *val) { + int ret; + DBT db_key; DBT db_data; - size_t i; + dbh_t *handle = dsh->dbh; + DB *dbp = handle->dbp[dsh->index]; db_enforce_locking(handle, "db_set_dbvalue"); @@ -375,17 +373,12 @@ db_data.data = val->data; db_data.size = val->leng; - for (i = 0; i < handle->count; i += 1) { - int ret; - DB *dbp = handle->dbp[i]; - - ret = dbp->put(dbp, NULL, &db_key, &db_data, 0); + ret = dbp->put(dbp, NULL, &db_key, &db_data, 0); - if (ret != 0) { - print_error(__FILE__, __LINE__, "(db) db_set_dbvalue( '%*s' ), err: %d, %s", - token->size, (char *)token->data, ret, db_strerror(ret)); - exit(EX_ERROR); - } + if (ret != 0) { + print_error(__FILE__, __LINE__, "(db) db_set_dbvalue( '%*s' ), err: %d, %s", + token->size, (char *)token->data, ret, db_strerror(ret)); + exit(EX_ERROR); } return 0; @@ -416,6 +409,8 @@ for (i = 0; i < handle->count; i += 1) { DB *dbp = handle->dbp[i]; + if (dbp == NULL) + continue; if ((ret = dbp->close(dbp, f))) print_error(__FILE__, __LINE__, "(db) db_close err: %d, %s", ret, db_strerror(ret)); } Index: datastore_db.h =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/Attic/datastore_db.h,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -u -d -r1.1.2.2 -r1.1.2.3 --- datastore_db.h 7 Aug 2003 15:57:50 -0000 1.1.2.2 +++ datastore_db.h 7 Aug 2003 22:51:49 -0000 1.1.2.3 @@ -78,7 +78,7 @@ int db_lock(int fd, int cmd, short int type); /* Prints wordlist name(s) */ -void dbh_print_names(dsh_t *dsh, const char *msg); +void 
dbh_print_names(void *handle, const char *msg); /* Returns version string */ const char *db_version_str(void); |