[cvs] bogofilter/src lexer_v3.l,1.9,1.10
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by:
m-a
From: <re...@us...> - 2003-05-02 23:03:15
|
Update of /cvsroot/bogofilter/bogofilter/src In directory sc8-pr-cvs1:/tmp/cvs-serv9226/src Modified Files: lexer_v3.l Log Message: Provide lexer support for tokenizing innards of html tags, comments, and script blocks. Index: lexer_v3.l =================================================================== RCS file: /cvsroot/bogofilter/bogofilter/src/lexer_v3.l,v retrieving revision 1.9 retrieving revision 1.10 diff -u -d -r1.9 -r1.10 --- lexer_v3.l 2 May 2003 11:23:22 -0000 1.9 +++ lexer_v3.l 2 May 2003 23:03:10 -0000 1.10 @@ -51,6 +51,7 @@ #include "buff.h" #include "charset.h" +#include "html.h" /* for strict_check */ #include "lexer.h" #include "mime.h" /* for mime_*() */ #include "msgcounts.h" @@ -62,10 +63,18 @@ #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, max_size) +#undef stderr +#define stderr dbgout /* for debug & -D options */ + static word_t yyt; -static word_t *yy_text(void); static int lineno; +/* Function Prototypes */ + +static word_t *yy_text(void); + +/* Function Definitions */ + static word_t *yy_text(void) { yyt.text = (byte *)yytext; @@ -76,7 +85,9 @@ %} %option debug nodebug -%option align nounput noyywrap noreject 8bit caseless +%option align caseless 8bit +%option never-interactive noreject +%option noyywrap %option prefix="lexer_v3_" UINT8 ([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5])) @@ -102,10 +113,8 @@ TOKEN {TOKENFRONT}{TOKENMID_NJS_DR}{TOKENBACK} -T2 {TOKENFRONT}<>;=():&%$#@!+|/\\{}^\"\?\*,[:cntrl:]\[]+ -T1 {TOKENFRONT} - %s TEXT HTML BOGO_LEX +%s HTOKEN SCOMMENT LCOMMENT HSCRIPT %% @@ -142,7 +151,7 @@ { case MIME_TEXT_PLAIN: BEGIN(TEXT); break; case MIME_TEXT_HTML: BEGIN(HTML); break; - default: BEGIN(TEXT); break; + default: BEGIN(TEXT); break; } return(EMPTY); } @@ -155,13 +164,22 @@ return TOKEN; } } -{IPADDR} { return(IPADDR);} -{TOKEN} { return(TOKEN);} -<HTML>\<\!-- { html_comment(+1); } -<HTML>--> { html_comment(-1); } -<HTML>\< { html_tag(1); } -<HTML>\> { html_tag(0); } +<HTML>"<!--" { BEGIN SCOMMENT; } +<HTML>"<!" { if (!strict_check) { + BEGIN LCOMMENT; } else { BEGIN HTOKEN; }} + +<LCOMMENT>">" { BEGIN HTML; } /* end of loose comment; return to normal html processing */ +<SCOMMENT>"-->" { BEGIN HTML; } /* end of strict comment; return to normal html processing */ +<HTML>"<" { BEGIN HTOKEN; } +<HTOKEN>">" { BEGIN HTML; } /* end of tag; return to normal html processing */ + +<HTOKEN>{TOKEN} { if (tokenize_html_tags) return TOKEN; } +<SCOMMENT,LCOMMENT>{TOKEN} { if (tokenize_html_comments) return TOKEN; } +<HSCRIPT>{TOKEN} { if (tokenize_html_script) return TOKEN; } + +{IPADDR} { return IPADDR;} +{TOKEN} { return TOKEN;} . ; \n { got_newline(); @@ -170,6 +188,8 @@ %% +void *v = &yyunput; /* suppress compiler warning */ + void lexer_v3_init(FILE *fp) { lineno = 0; @@ -181,5 +201,6 @@ * The following sets edit modes for GNU EMACS * Local Variables: * mode:c + * indent-tabs-mode:t * End: */ |