[cvs] bogofilter/src bogoconfig.c,1.54,1.55 bogolexer.c,1.21,1.22 globals.c,1.16,1.17 globals.h,1.18,1.19 token.c,1.16,1.17
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <re...@us...> - 2003-05-12 19:29:50
|
Update of /cvsroot/bogofilter/bogofilter/src In directory sc8-pr-cvs1:/tmp/cvs-serv8039 Modified Files: bogoconfig.c bogolexer.c globals.c globals.h token.c Log Message: Allow case folding to be turned off by "fold_case" config file option and '-Pf' command line switch. Add '-Ph' switch to toggle tagging of header lines. Rename parsing switches from "-Hx" to "-Px". Index: bogoconfig.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/bogoconfig.c,v retrieving revision 1.54 retrieving revision 1.55 diff -u -d -r1.54 -r1.55 --- bogoconfig.c 11 May 2003 15:44:28 -0000 1.54 +++ bogoconfig.c 12 May 2003 19:29:44 -0000 1.55 @@ -156,6 +156,7 @@ { "tag_header_lines", CP_BOOLEAN, { (void *) &tag_header_lines } }, { "strict_check", CP_BOOLEAN, { (void *) &strict_check } }, + { "fold_case", CP_BOOLEAN, { (void *) &fold_case } }, { "tokenize_html_tags", CP_BOOLEAN, { (void *) &tokenize_html_tags } }, { "tokenize_html_script", CP_BOOLEAN, { (void *) &tokenize_html_script } }, /* Not yet in use */ { "tokenize_html_comments", CP_BOOLEAN, { (void *) &tokenize_html_comments } },/* Not yet in use */ @@ -311,7 +312,7 @@ "\t -2 - set binary classification mode (yes/no).\n" "\t -3 - set ternary classification mode (yes/no/unsure).\n"); (void)fprintf(stderr, - "\t -H {opts} - set html processing flag(s).\n" + "\t -P {opts} - set html processing flag(s).\n" "\t where {opts} is one or more of:\n" "\t C - enable strict comment checking (default is loose checking).\n" "\t t - return tokens from inside html tags.\n" @@ -442,7 +443,7 @@ #if HAVE_DECL_OPTRESET optreset = 1; #endif - while ((option = getopt(argc, argv, ":23bBc:Cd:DefFghH:I:lL:m:MnNo:O:pqQRrsStTuvVx:y:" G R F)) != -1) + while ((option = getopt(argc, argv, ":23bBc:Cd:DefFghI:lL:m:MnNo:O:pP:qQRrsStTuvVx:y:" G R F)) != -1) { #if 0 if (getenv("BOGOFILTER_DEBUG_OPTIONS")) { @@ -517,21 +518,28 @@ help(); exit(0); - case 'H': + case 'P': { char *s; for (s = optarg; *s && 
pass == 2; s += 1) { switch (*s) { - case 't': tokenize_html_tags ^= true; + case 't': tokenize_html_tags ^= true; /* -Ht */ break; - case 's': tokenize_html_script ^= true; /* Not yet in use */ + case 's': tokenize_html_script ^= true; /* -Hs - not yet in use */ break; - case 'C': strict_check ^= true; + case 'C': strict_check ^= true; /* -HC */ /*@fallthrough@*/ - case 'c': tokenize_html_comments ^= true; /* Not yet in use */ + case 'c': tokenize_html_comments ^= true; /* -Hc - not yet in use */ + break; + case 'h': tag_header_lines ^= true; /* -Hh */ break; + case 'f': fold_case ^= true; /* -Hf */ + break; + default: + fprintf(stderr, "Unknown parsing option -H%c.\n", *s); + exit(2); } } break; Index: bogolexer.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/bogolexer.c,v retrieving revision 1.21 retrieving revision 1.22 diff -u -d -r1.21 -r1.22 --- bogolexer.c 2 May 2003 23:03:05 -0000 1.21 +++ bogolexer.c 12 May 2003 19:29:44 -0000 1.22 @@ -80,7 +80,7 @@ "\t-x list\t- set debug flags.\n" "\t-D\t- direct debug output to stdout.\n"); fprintf(stderr, - "\t -H {opts} - set html processing flag(s).\n" + "\t -P {opts} - set html processing flag(s).\n" "\t where {opts} is one or more of:\n" "\t C - enable strict comment checking (default is loose checking).\n" "\t t - return tokens from inside html tags.\n" @@ -100,7 +100,7 @@ fpin = stdin; dbgout = stderr; - while ((option = getopt(argc, argv, ":c:CDhH:I:npqTvx:")) != -1) + while ((option = getopt(argc, argv, ":c:CDhI:npP:qTvx:")) != -1) { switch (option) { @@ -129,21 +129,28 @@ help(); exit(0); - case 'H': + case 'P': { char *s; for (s = optarg; *s ; s += 1) { switch (*s) { - case 't': tokenize_html_tags ^= true; + case 't': tokenize_html_tags ^= true; /* -Pt */ break; - case 's': tokenize_html_script ^= true; /* Not yet in use */ + case 's': tokenize_html_script ^= true; /* -Ps - not yet in use */ break; - case 'C': strict_check ^= true; + case 
'C': strict_check ^= true; /* -PC */ /*@fallthrough@*/ - case 'c': tokenize_html_comments ^= true; /* Not yet in use */ + case 'c': tokenize_html_comments ^= true; /* -Pc - not yet in use */ + break; + case 'h': tag_header_lines ^= true; /* -Ph */ break; + case 'f': fold_case ^= true; /* -Pf */ + break; + default: + fprintf(stderr, "Unknown parsing option -P%c.\n", *s); + exit(2); } } break; Index: globals.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/globals.c,v retrieving revision 1.16 retrieving revision 1.17 diff -u -d -r1.16 -r1.17 --- globals.c 2 May 2003 23:03:04 -0000 1.16 +++ globals.c 12 May 2003 19:29:44 -0000 1.17 @@ -42,16 +42,17 @@ double min_dev; double spam_cutoff; double thresh_stats; -bool tag_header_lines = false; /* true */ const char *update_dir; /*@observer@*/ const char *stats_prefix; /* for lexer_v3.l */ -bool tokenize_html_tags = false; -bool tokenize_html_script = false; /* Not yet in use */ -bool tokenize_html_comments = false; /* Not yet in use */ +bool fold_case = true; /* -Pf */ +bool tag_header_lines = false; /* -Ph */ +bool tokenize_html_tags = false; /* -Pt */ +bool tokenize_html_script = false; /* -Ps - not yet in use */ +bool tokenize_html_comments = false; /* -Pc - Not yet in use */ /* dual definition options */ char *directory; /* '-d' */ Index: globals.h =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/globals.h,v retrieving revision 1.18 retrieving revision 1.19 diff -u -d -r1.18 -r1.19 --- globals.h 2 May 2003 23:03:04 -0000 1.18 +++ globals.h 12 May 2003 19:29:44 -0000 1.19 @@ -43,12 +43,13 @@ extern int abort_on_error; extern bool stats_in_header; -extern bool tag_header_lines; /* for lexer_v3.l */ -extern bool tokenize_html_tags; -extern bool tokenize_html_script; -extern bool tokenize_html_comments; +extern bool fold_case; /* -Pf */ +extern bool tag_header_lines; /* -Ph */ +extern bool 
tokenize_html_tags; /* -Pt */ +extern bool tokenize_html_script; /* -Ps */ +extern bool tokenize_html_comments; /* -Pc */ extern int db_cachesize; Index: token.c =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v retrieving revision 1.16 retrieving revision 1.17 diff -u -d -r1.16 -r1.17 --- token.c 10 May 2003 14:50:24 -0000 1.16 +++ token.c 12 May 2003 19:29:44 -0000 1.17 @@ -69,6 +69,11 @@ yylval->leng = lexer_v3_leng; yylval->text = (byte *)lexer_v3_text; + if (DEBUG_TEXT(1)) { + word_puts(yylval, 0, dbgout); + fputc('\n', dbgout); + } + if (class <= 0) break; @@ -164,8 +169,9 @@ } /* Need separate loop so lexer can see "From", "Date", etc */ - for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1) - *cp = casefold_table[*cp]; + if (fold_case) + for (cp = yylval->text; cp < yylval->text+yylval->leng; cp += 1) + *cp = casefold_table[*cp]; return(class); } @@ -202,12 +208,28 @@ return; } +#if 1 +const char *prefixes = "|subj:"; +#else +const char *prefixes = "|to:|from:|rtrn:|subj:"; +#endif + void set_tag(const char *tag) { if (tag_header_lines) { - word_free(token_prefix); - token_prefix = word_new((const byte *)tag, strlen(tag)); + const char *tmp; + size_t len = strlen(tag); + + for (tmp = prefixes; tmp != NULL; + (tmp = strchr(tmp, '|')) && (tmp += 1)) { + if (memcmp(tmp, tag, len) == 0) { + word_free(token_prefix); + token_prefix = word_new((const byte *)tag, strlen(tag)); + return; + } + } } + return; } /* Cleanup storage allocation */ |