[cvs] bogofilter/src lexer.h,1.23,1.24 lexer_v3.l,1.79,1.80 token.c,1.55,1.56

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/bogofilter/bogofilter/src
In directory sc8-pr-cvs1:/tmp/cvs-serv29308

Modified Files:
	lexer.h lexer_v3.l token.c 
Log Message:
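Revise parsing rules to discard likely singletons, i.e. tokens that are
unlikely to occur more than once (message IDs, dates, SMTP ids): Date and
(Resent-)Message-ID header lines are now ignored, and In-Reply-To/References
lines are reduced to the header keyword via the new HEADKEY token type.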

Index: lexer.h
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer.h,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -d -r1.23 -r1.24

--- lexer.h	8 Sep 2003 12:14:19 -0000	1.23
+++ lexer.h	10 Sep 2003 01:04:15 -0000	1.24
@@ -27,6 +27,7 @@
 typedef enum {
     NONE,
     TOKEN,	/* regular token */
+    HEADKEY,	/* header keyword */
     EOH,	/* end-of-header (empty line) */
     BOUNDARY,	/* MIME multipart boundary line */
     IPADDR,	/* ip address */

Index: lexer_v3.l
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v
retrieving revision 1.79
retrieving revision 1.80
diff -u -d -r1.79 -r1.80
--- lexer_v3.l	7 Sep 2003 01:10:25 -0000	1.79
+++ lexer_v3.l	10 Sep 2003 01:04:15 -0000	1.80
@@ -119,7 +119,7 @@
 BCHARS		[0-9a-zA-Z'()+_,-./:=?# ]
 MIME_BOUNDARY	{BCHARS}{0,69}{BCHARSNOSPC}
 
-ID		[0-9a-zA-Z-]*
+ID		<?[0-9a-zA-Z-]*>?
 CHARSET		[0-9a-zA-Z-]+
 MTYPE		[ \t]*[0-9a-zA-Z/-]*
 TTYPE		[ \t]*[\.0-9a-zA-Z/-]*
@@ -215,17 +215,19 @@
 
 <INITIAL>^(To|From|Return-Path|Subject):	{ if (header_line_markup) set_tag(yy_text()); }
 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); return TOKEN; }
-<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); 	skip_to(':'); return TOKEN; }
+<INITIAL>^MIME-Version:.*			{ mime_version(yy_text()); skip_to(':'); return TOKEN; }
 
-<INITIAL>^Message-ID:.*				;
-<INITIAL>^(Delivery-)?Date:.*			;
+<INITIAL>^(Delivery-)?Date:.*			/* ignore */
+<INITIAL>^(Resent-)?Message-ID:.*		/* ignore */
+
+<INITIAL>^(In-Reply-To|References):.* 		{ return HEADKEY; }
 
 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
-<INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); 		skip_to('='); return TOKEN; }
+<INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); return TOKEN; }
 
-<INITIAL>(file)?name=\"?			
-<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		;
-<INITIAL>[:blank:]*id\ {ID}			;
+<INITIAL>(file)?name=\"?			/* ignore */
+<INITIAL>(ESMTP|SMTP)+[ \t\n]+id\ {ID}		/* ignore */
+<INITIAL>[:blank:]*id\ {ID}			/* ignore */
 
 <INITIAL>\n[ \t]				{ lineno += 1; }
 <INITIAL>\n\n					{ if (get_content_type() == MIME_TEXT_HTML)
@@ -274,7 +276,7 @@
 
 \${DOLLARS}(\.{CENTS})?				{ return TOKEN;}
 
-.						;			/* ignore character */
+.						/* ignore character */
 \n						{ lineno += 1; clr_tag(); }
 %%
 

Index: token.c
===================================================================
RCS file: /cvsroot/bogofilter/bogofilter/src/token.c,v
retrieving revision 1.55
retrieving revision 1.56
diff -u -d -r1.55 -r1.56
--- token.c	6 Sep 2003 20:50:39 -0000	1.55
+++ token.c	10 Sep 2003 01:04:15 -0000	1.56
@@ -100,6 +100,15 @@
 	case BOUNDARY:	/* don't return boundary tokens to the user */
 	    continue;
 
+	case HEADKEY:
+	{
+	    const char *delim = index((const char *)yylval->text, ':');
+	    yylval->leng = delim - (const char *)yylval->text;
+	    Z(yylval->text[yylval->leng]);
+	}
+
+	/*@fallthrough@*/
+
 	case TOKEN:	/* ignore anything when not reading text MIME types */
 	    if (token_prefix != NULL) {
 		word_t *w = word_concat(token_prefix, yylval);






[cvs] bogofilter/src lexer.h,1.23,1.24 lexer_v3.l,1.79,1.80 token.c,1.55,1.56

Fast Bayesian spam filter along lines suggested by Paul Graham

[cvs] bogofilter/src lexer.h,1.23,1.24 lexer_v3.l,1.79,1.80 token.c,1.55,1.56