[cvs] bogofilter/src lexer.h,1.23,1.24 lexer_v3.l,1.79,1.80 token.c,1.55,1.56
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <re...@us...> - 2003-09-10 01:04:18
|
Update of /cvsroot/bogofilter/bogofilter/src
In directory sc8-pr-cvs1:/tmp/cvs-serv29308

Modified Files:
    lexer.h lexer_v3.l token.c
Log Message:
Revise parsing rules to discard likely singletons.

Index: lexer.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer.h,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -d -r1.23 -r1.24
--- lexer.h 8 Sep 2003 12:14:19 -0000 1.23
+++ lexer.h 10 Sep 2003 01:04:15 -0000 1.24
@@ -27,6 +27,7 @@
 typedef enum {
     NONE,
     TOKEN, /* regular token */
+    HEADKEY, /* header keyword */
     EOH, /* end-of-header (empty line) */
     BOUNDARY, /* MIME multipart boundary line */
     IPADDR, /* ip address */

Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.79
retrieving revision 1.80
diff -u -d -r1.79 -r1.80
--- lexer_v3.l 7 Sep 2003 01:10:25 -0000 1.79
+++ lexer_v3.l 10 Sep 2003 01:04:15 -0000 1.80
@@ -119,7 +119,7 @@
 BCHARS [0-9a-zA-Z'()+_,-./:=?# ]
 MIME_BOUNDARY {BCHARS}{0,69}{BCHARSNOSPC}
-ID [0-9a-zA-Z-]*
+ID <?[0-9a-zA-Z-]*>?
 CHARSET [0-9a-zA-Z-]+
 MTYPE [ \t]*[0-9a-zA-Z/-]*
 TTYPE [ \t]*[\.0-9a-zA-Z/-]*
@@ -215,17 +215,19 @@
 <INITIAL>^(To|From|Return-Path|Subject): { if (header_line_markup) set_tag(yy_text()); }
 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); skip_to(':'); return TOKEN; }
-<INITIAL>^MIME-Version:.* { mime_version(yy_text()); skip_to(':'); return TOKEN; }
+<INITIAL>^MIME-Version:.* { mime_version(yy_text()); skip_to(':'); return TOKEN; }
-<INITIAL>^Message-ID:.* ;
-<INITIAL>^(Delivery-)?Date:.* ;
+<INITIAL>^(Delivery-)?Date:.* /* ignore */
+<INITIAL>^(Resent-)?Message-ID:.* /* ignore */
+
+<INITIAL>^(In-Reply-To|References):.* { return HEADKEY; }
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"? { mime_boundary_set(yy_text()); }
-<INITIAL>charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); return TOKEN; }
+<INITIAL>charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); return TOKEN; }
-<INITIAL>(file)?name=\"?
-<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID} ;
-<INITIAL>[:blank:]*id\ {ID} ;
+<INITIAL>(file)?name=\"? /* ignore */
+<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID} /* ignore */
+<INITIAL>[:blank:]*id\ {ID} /* ignore */
 <INITIAL>\n[ \t] { lineno += 1; }
 <INITIAL>\n\n { if (get_content_type() == MIME_TEXT_HTML)
@@ -274,7 +276,7 @@
 \${DOLLARS}(\.{CENTS})? { return TOKEN;}
-. ; /* ignore character */
+. /* ignore character */
 \n { lineno += 1; clr_tag(); }
 %%

Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.55
retrieving revision 1.56
diff -u -d -r1.55 -r1.56
--- token.c 6 Sep 2003 20:50:39 -0000 1.55
+++ token.c 10 Sep 2003 01:04:15 -0000 1.56
@@ -100,6 +100,15 @@
 case BOUNDARY: /* don't return boundary tokens to the user */
     continue;
+case HEADKEY:
+{
+    const char *delim = index((const char *)yylval->text, ':');
+    yylval->leng = delim - (const char *)yylval->text;
+    Z(yylval->text[yylval->leng]);
+}
+
+/*@fallthrough@*/
+
 case TOKEN: /* ignore anything when not reading text MIME types */
     if (token_prefix != NULL) {
         word_t *w = word_concat(token_prefix, yylval);
|