Thread: [cvs] SF.net SVN: bogofilter:[6951] branches/avoid-rfc2047-pushback/bogofilter

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 6951
          http://bogofilter.svn.sourceforge.net/bogofilter/?rev=6951&view=rev
Author:   m-a
Date:     2011-05-08 02:19:14 +0000 (Sun, 08 May 2011)

Log Message:
-----------
Halfway fix for RFC2047-induced lexer pushback buffer overflow.

Modified Paths:
--------------
    branches/avoid-rfc2047-pushback/bogofilter/NEWS
    branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c
    branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l

Modified: branches/avoid-rfc2047-pushback/bogofilter/NEWS
===================================================================

--- branches/avoid-rfc2047-pushback/bogofilter/NEWS	2011-05-08 02:13:31 UTC (rev 6950)
+++ branches/avoid-rfc2047-pushback/bogofilter/NEWS	2011-05-08 02:19:14 UTC (rev 6951)
@@ -15,6 +15,22 @@
 
 -------------------------------------------------------------------------------
 
+	2011-05-08
+
+	* Halfway fix for the RFC2047-related lexer pushback buffer overflow,
+	  reported to the bogofilter mailing list the day before.
+
+	  The fix is almost complete and resolves the pushback overflow in the
+	  MIME parser (see "spam" under evidence-vault), but one TODO remains:
+	  unfortunately, the code runs the charset conversion twice, corrupting
+	  non-ASCII characters. This needs to be fixed.
+
+	  TODO: The other messages will probably still break the parser --
+	  we mustn't use pushback with flex (it's likely very inefficient,
+	  too).
+
+	2010-09-15
+
 	* Mark "Berkeley DB 5.1.19: (August 27, 2010)" supported.
 
 1.2.2	2010-07-08 (released)

Modified: branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c
===================================================================
--- branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c	2011-05-08 02:13:31 UTC (rev 6950)
+++ branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c	2011-05-08 02:19:14 UTC (rev 6951)
@@ -149,9 +149,22 @@
     return count;
 }
 
+/*  RFC2047.2
+    encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
+    charset = token    ; see section 3
+    encoding = token   ; see section 4
+    token = 1*<Any CHAR except SPACE, CTLs, and especials>
+    especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
+		<"> / "/" / "[" / "]" / "?" / "." / "="
+    encoded-text = 1*<Any printable ASCII character other than "?"
+		      or SPACE>
+		   ; (but see "Use of encoded-words in message
+		   ; headers", section 5)
+*/
 static int get_decoded_line(buff_t *buff)
 {
     int count;
+    int c;
     buff_t *linebuff;
 
 #ifdef	DISABLE_UNICODE
@@ -192,6 +205,26 @@
 	}
     }
 
+    if (msg_header) {
+	int oread = linebuff->read;
+
+	do {
+	    int add;
+
+	    /* in headers, peek at the next character to see if we need to fetch another
+	     * line to unfold headers */
+	    c = getc(fpin);
+	    if (c == EOF) break;
+	    ungetc(c, fpin);
+	    if (isblank(c)) {
+		add = yy_get_new_line(linebuff);
+		if (add >= 0) count += add; else break;
+	    }
+	} while (isblank(c));
+
+	linebuff->read = oread;
+    }
+
     /* Save the text on a linked list of lines.
      * Note that we store fixed-length blocks here, not lines.
      * One very long physical line could break up into more
@@ -200,6 +233,30 @@
     if (passthrough && count > 0)
 	textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count);
 
+    if (msg_header) {
+	/* Try RFC-2047 decoder on everything */
+	word_t temp, *res;
+
+	temp.leng = (uint)count;
+	temp.u.text = linebuff->t.u.text+linebuff->read;
+
+	if (DEBUG_LEXER(2)) {
+	    fprintf(dbgout, "before: ");
+	    lexer_display_buffer(linebuff);
+	}
+	res = text_decode(&temp);
+	if (res != &temp) {
+	    memcpy(linebuff->t.u.text+linebuff->read,
+		    res->u.text, res->leng);
+	}
+	linebuff->t.leng -= (uint)(count - res->leng);
+	count = res->leng;
+	if (DEBUG_LEXER(2)) {
+	    fprintf(dbgout, "after:   ");
+	    lexer_display_buffer(linebuff);
+	}
+    }
+
     if ( !msg_header && 
 	 !msg_state->mime_dont_decode &&
 	 msg_state->mime_type != MIME_TYPE_UNKNOWN)
@@ -225,6 +282,7 @@
 	!msg_state->mime_dont_decode)
     {
 	iconvert(linebuff, buff);
+
 	/*
 	 * iconvert, treating multi-byte sequences, can shrink or enlarge
 	 * the output compared to its input.  Correct count.
@@ -401,7 +459,7 @@
     uint size = (uint) (txt - beg);				/* output offset */
 
 #ifndef	DISABLE_UNICODE
-    size_t max = w->leng * 4;
+    size_t max = w->leng * 6;
     static buff_t * buf = NULL;
 #endif
 
@@ -439,17 +497,29 @@
 
 	char *charset;
 
-	txt += 2;
+	if (txt[0] == '=' && txt[1] == '?') {
+	    txt += 2;
+	} else {
+	    len = fin - txt;
+	    memcpy(buf->t.u.text+size, txt, len);
+	    size += len;
+	    break;
+	}
+
+
 	typ = (byte *) memchr((char *)txt+1, '?', fin-txt);	/* Encoding type - 'B' or 'Q' */
-	*typ++ = '\0';						/* nul terminate */
+	if (!typ) break;
 
+	typ++;
 	charset = charset_as_string(txt, typ - txt - 1);
 
 	tmp = typ + 2;						/* start of encoded word */
 	end = (byte *) memstr((char *)tmp, fin-tmp, "?=");	/* last byte of encoded word  */
+	if (!end) break;
+
 	len = end - tmp;
 
-	w->u.text = tmp;				/* Start of encoded word */
+	w->u.text = tmp;			/* Start of encoded word */
 	w->leng = len;				/* Length of encoded word */
 	Z(w->u.text[w->leng]);			/* for easier debugging - removable */
 
@@ -497,6 +567,8 @@
 	    iconvert_cd(cd, &src, buf);
 	    iconv_close(cd);
 
+	    size = buf->t.leng;
+
 	    if (DEBUG_LEXER(3)) {
 		fputs("**4**  ", dbgout);
 		word_puts(&buf->t, 0, dbgout);
@@ -527,11 +599,12 @@
 
 	/* we have a next encoded word and we've had only whitespace
 	 * between the current and the next */
-	if (adjacent)
+	if (adjacent) {
 	    /* just skip whitespace */
 	    txt = end;
-	else
+	} else {
 	    /* copy everything that was between the encoded words */
+	    if (!end) end = fin;
 	    while (txt < end) {
 		if (encoding == E_RAW)
 		    beg[size++] = *txt++;
@@ -540,6 +613,7 @@
 		    buf->t.u.text[buf->t.leng++] = *txt++;
 #endif
 	    }
+	}
     }
 
     if (encoding == E_RAW) {

Modified: branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l
===================================================================
--- branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l	2011-05-08 02:13:31 UTC (rev 6950)
+++ branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l	2011-05-08 02:19:14 UTC (rev 6951)
@@ -161,38 +161,12 @@
 
 TOKEN		{FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?
 
-/*  RFC2047.2
-    encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
-    charset = token    ; see section 3
-    encoding = token   ; see section 4
-    token = 1*<Any CHAR except SPACE, CTLs, and especials>
-    especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
-		<"> / "/" / "[" / "]" / "?" / "." / "="
-    encoded-text = 1*<Any printable ASCII character other than "?"
-		      or SPACE>
-		   ; (but see "Use of encoded-words in message
-		   ; headers", section 5)
-*/
-
-/* 09/01/03
-  Using "[^?]" in the pattern and validating the charset in 'C'
-  reduces executable size by approx 290k.
-  new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
-  old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?=
-
-  BASE64	[0-9a-zA-Z/+=]+
-  QP		[!->@-~]+
-*/
-
 WHITESPACE	[[:blank:]\n]
 NOTWHITESPACE	[^[:blank:]\n]
 
 HTML_ENCODING	"&#"x?[[:xdigit:]]+";"
 URL_ENCODING	"%"[[:xdigit:]][[:xdigit:]]
 
-ENCODED_WORD	=\?{CHARSET}\?[bq]\?[^?\n]*\?=
-ENCODED_TOKEN	({FRONT_CHAR}{MID_CHAR}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}
-
 /*
 HTML_WI_COMMENTS	"<"[^>]*">"
 HTML_WO_COMMENTS	"<"[^!][^>]*">"\|"<>"
@@ -232,11 +206,6 @@
 <BOGO_LEX>^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$	{ return BOGO_LEX_LINE; }
 <BOGO_LEX>\n					{ lineno += 1; }
 
-<INITIAL>{ENCODED_TOKEN}			{ word_t *raw = yy_text();
-						  word_t *txt = text_decode(raw);
-						  yy_unput(txt->u.text, txt->leng);
-						}
-
 <INITIAL>^(To|CC|From|Return-Path|Subject|Received):	{ set_tag(yytext); }
 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); header(); return TOKEN; }
 
@@ -439,6 +408,8 @@
 
 static void yy_unput(const byte *txt, uint len)
 {
+    if (DEBUG_LEXER(3))
+	fprintf(dbgout, "Ungetting %u bytes.\n", len);
     while (len-- > 0)
 	yyunput(txt[len], yytext);
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




Thread: [cvs] SF.net SVN: bogofilter:[6951] branches/avoid-rfc2047-pushback/bogofilter

Fast Bayesian spam filter along lines suggested by Paul Graham

bogofilter-cvs