Thread: [cvs] SF.net SVN: bogofilter:[6951] branches/avoid-rfc2047-pushback/bogofilter
Fast Bayesian spam filter along lines suggested by Paul Graham
Brought to you by: m-a
From: <m-...@us...> - 2011-05-08 02:19:20
|
Revision: 6951 http://bogofilter.svn.sourceforge.net/bogofilter/?rev=6951&view=rev Author: m-a Date: 2011-05-08 02:19:14 +0000 (Sun, 08 May 2011) Log Message: ----------- Halfway fix for RFC2047-induced lexer pushback buffer overflow. Modified Paths: -------------- branches/avoid-rfc2047-pushback/bogofilter/NEWS branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l Modified: branches/avoid-rfc2047-pushback/bogofilter/NEWS =================================================================== --- branches/avoid-rfc2047-pushback/bogofilter/NEWS 2011-05-08 02:13:31 UTC (rev 6950) +++ branches/avoid-rfc2047-pushback/bogofilter/NEWS 2011-05-08 02:19:14 UTC (rev 6951) @@ -15,6 +15,22 @@ ------------------------------------------------------------------------------- + 2011-05-08 + + * Halfway fix for the RFC2047-related lexer pushback buffer overflow, + reported to the bogofilter mailing list the day before. + + The fix is almost complete, resolves the pushback overflow in the + MIME parser (see "spam" under evidence-vault), but there is one + TODO: Unfortunately, the code runs the charset conversion twice, causing + corruption of non-ASCII characters. This needs to be fixed. + + TODO: The other messages will probably still break the parser -- + we mustn't use pushback with flex (it's likely very inefficient, + too). + + 2010-09-15 + * Mark "Berkeley DB 5.1.19: (August 27, 2010)" supported. 1.2.2 2010-07-08 (released) Modified: branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c =================================================================== --- branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c 2011-05-08 02:13:31 UTC (rev 6950) +++ branches/avoid-rfc2047-pushback/bogofilter/src/lexer.c 2011-05-08 02:19:14 UTC (rev 6951) @@ -149,9 +149,22 @@ return count; } +/* RFC2047.2 + encoded-word = "=?" charset "?" encoding "?" 
encoded-text "?=" + charset = token ; see section 3 + encoding = token ; see section 4 + token = 1*<Any CHAR except SPACE, CTLs, and especials> + especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / " + <"> / "/" / "[" / "]" / "?" / "." / "=" + encoded-text = 1*<Any printable ASCII character other than "?" + or SPACE> + ; (but see "Use of encoded-words in message + ; headers", section 5) +*/ static int get_decoded_line(buff_t *buff) { int count; + int c; buff_t *linebuff; #ifdef DISABLE_UNICODE @@ -192,6 +205,26 @@ } } + if (msg_header) { + int oread = linebuff->read; + + do { + int add; + + /* in headers, peek at the next character to see if we need to fetch another + * line to unfold headers */ + c = getc(fpin); + if (c == EOF) break; + ungetc(c, fpin); + if (isblank(c)) { + add = yy_get_new_line(linebuff); + if (add >= 0) count += add; else break; + } + } while (isblank(c)); + + linebuff->read = oread; + } + /* Save the text on a linked list of lines. * Note that we store fixed-length blocks here, not lines. 
* One very long physical line could break up into more @@ -200,6 +233,30 @@ if (passthrough && count > 0) textblock_add(linebuff->t.u.text+linebuff->read, (size_t) count); + if (msg_header) { + /* Try RFC-2047 decoder on everything */ + word_t temp, *res; + + temp.leng = (uint)count; + temp.u.text = linebuff->t.u.text+linebuff->read; + + if (DEBUG_LEXER(2)) { + fprintf(dbgout, "before: "); + lexer_display_buffer(linebuff); + } + res = text_decode(&temp); + if (res != &temp) { + memcpy(linebuff->t.u.text+linebuff->read, + res->u.text, res->leng); + } + linebuff->t.leng -= (uint)(count - res->leng); + count = res->leng; + if (DEBUG_LEXER(2)) { + fprintf(dbgout, "after: "); + lexer_display_buffer(linebuff); + } + } + if ( !msg_header && !msg_state->mime_dont_decode && msg_state->mime_type != MIME_TYPE_UNKNOWN) @@ -225,6 +282,7 @@ !msg_state->mime_dont_decode) { iconvert(linebuff, buff); + /* * iconvert, treating multi-byte sequences, can shrink or enlarge * the output compared to its input. Correct count. 
@@ -401,7 +459,7 @@ uint size = (uint) (txt - beg); /* output offset */ #ifndef DISABLE_UNICODE - size_t max = w->leng * 4; + size_t max = w->leng * 6; static buff_t * buf = NULL; #endif @@ -439,17 +497,29 @@ char *charset; - txt += 2; + if (txt[0] == '=' && txt[1] == '?') { + txt += 2; + } else { + len = fin - txt; + memcpy(buf->t.u.text+size, txt, len); + size += len; + break; + } + + typ = (byte *) memchr((char *)txt+1, '?', fin-txt); /* Encoding type - 'B' or 'Q' */ - *typ++ = '\0'; /* nul terminate */ + if (!typ) break; + typ++; charset = charset_as_string(txt, typ - txt - 1); tmp = typ + 2; /* start of encoded word */ end = (byte *) memstr((char *)tmp, fin-tmp, "?="); /* last byte of encoded word */ + if (!end) break; + len = end - tmp; - w->u.text = tmp; /* Start of encoded word */ + w->u.text = tmp; /* Start of encoded word */ w->leng = len; /* Length of encoded word */ Z(w->u.text[w->leng]); /* for easier debugging - removable */ @@ -497,6 +567,8 @@ iconvert_cd(cd, &src, buf); iconv_close(cd); + size = buf->t.leng; + if (DEBUG_LEXER(3)) { fputs("**4** ", dbgout); word_puts(&buf->t, 0, dbgout); @@ -527,11 +599,12 @@ /* we have a next encoded word and we've had only whitespace * between the current and the next */ - if (adjacent) + if (adjacent) { /* just skip whitespace */ txt = end; - else + } else { /* copy everything that was between the encoded words */ + if (!end) end = fin; while (txt < end) { if (encoding == E_RAW) beg[size++] = *txt++; @@ -540,6 +613,7 @@ buf->t.u.text[buf->t.leng++] = *txt++; #endif } + } } if (encoding == E_RAW) { Modified: branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l =================================================================== --- branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l 2011-05-08 02:13:31 UTC (rev 6950) +++ branches/avoid-rfc2047-pushback/bogofilter/src/lexer_v3.l 2011-05-08 02:19:14 UTC (rev 6951) @@ -161,38 +161,12 @@ TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})? 
-/* RFC2047.2 - encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" - charset = token ; see section 3 - encoding = token ; see section 4 - token = 1*<Any CHAR except SPACE, CTLs, and especials> - especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / " - <"> / "/" / "[" / "]" / "?" / "." / "=" - encoded-text = 1*<Any printable ASCII character other than "?" - or SPACE> - ; (but see "Use of encoded-words in message - ; headers", section 5) -*/ - -/* 09/01/03 - Using "[^?]" in the pattern and validating the charset in 'C' - reduces executable size by approx 290k. - new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?= - old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?= - - BASE64 [0-9a-zA-Z/+=]+ - QP [!->@-~]+ -*/ - WHITESPACE [[:blank:]\n] NOTWHITESPACE [^[:blank:]\n] HTML_ENCODING "&#"x?[[:xdigit:]]+";" URL_ENCODING "%"[[:xdigit:]][[:xdigit:]] -ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?\n]*\?= -ENCODED_TOKEN ({FRONT_CHAR}{MID_CHAR}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD} - /* HTML_WI_COMMENTS "<"[^>]*">" HTML_WO_COMMENTS "<"[^!][^>]*">"\|"<>" @@ -232,11 +206,6 @@ <BOGO_LEX>^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$ { return BOGO_LEX_LINE; } <BOGO_LEX>\n { lineno += 1; } -<INITIAL>{ENCODED_TOKEN} { word_t *raw = yy_text(); - word_t *txt = text_decode(raw); - yy_unput(txt->u.text, txt->leng); - } - <INITIAL>^(To|CC|From|Return-Path|Subject|Received): { set_tag(yytext); } <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); skip_to(':'); header(); return TOKEN; } @@ -439,6 +408,8 @@ static void yy_unput(const byte *txt, uint len) { + if (DEBUG_LEXER(3)) + fprintf(dbgout, "Ungetting %u bytes.\n", len); while (len-- > 0) yyunput(txt[len], yytext); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |