From: <jgr...@us...> - 2003-05-05 17:50:16
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv9618/Classifier Modified Files: MailParse.pm Log Message: Numerous updates to the mail parser to make colorization work for pseudowords, and fix various bugs Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.123 retrieving revision 1.124 diff -C2 -d -r1.123 -r1.124 *** MailParse.pm 30 Apr 2003 03:13:36 -0000 1.123 --- MailParse.pm 5 May 2003 17:50:10 -0000 1.124 *************** *** 70,73 **** --- 70,74 ---- $self->{htmlbackcolor__} = map_color( $self, 'white' ); + $self->{htmlbodycolor__} = map_color( $self, 'white' ); $self->{htmlfontcolor__} = map_color( $self, 'black' ); *************** *** 147,150 **** --- 148,153 ---- # $prefix The pseudoword prefix (e.g. header) # $word The pseudoword (e.g. Mime-Version) + # $encoded Whether this was found inside encoded text + # $literal The literal text that generated this pseudoword # # --------------------------------------------------------------------------------------------- *************** *** 152,158 **** sub update_pseudoword { ! my ( $self, $prefix, $word ) = @_; ! $self->increment_word( "$prefix:$word" ); } --- 155,175 ---- sub update_pseudoword { ! my ( $self, $prefix, $word, $encoded, $literal ) = @_; ! my $mword = "$prefix:$word"; ! ! if ( $self->{color__} ) { ! $literal =~ s/</</g; ! $literal =~ s/>/>/g; ! my $color = $self->{bayes__}->get_color($mword); ! my $to = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>"; ! if ( $encoded == 0 ) { ! $self->{ut__} =~ s/\Q$literal\E/$to/g; ! } else { ! $self->{ut__} .= $to . ' '; ! } ! } else { ! $self->increment_word( $mword ); ! } } *************** *** 194,198 **** $self->{ut__} .= "<font color=\"$color\">$word<\/font> "; } - } else { increment_word( $self, $mword ); --- 211,214 ---- *************** *** 259,263 **** $self->{ut__} =~ s/$from/$to/g; print "$from -> $to\n" if $self->{debug}; ! increment_word( $self, 'html:numericentity' ); } } --- 275,279 ---- $self->{ut__} =~ s/$from/$to/g; print "$from -> $to\n" if $self->{debug}; ! $self->update_pseudoword( 'html', 'numericentity', $encoded, $from ); } } *************** *** 283,297 **** # Deal with runs of alternating spaces and letters - # TODO: find a way to make this (and other similar stuff) highlight - # without using the encoded content printer or modifying $self->{ut__} ! foreach my $space (' ', '\'', '*', '^', '`', ' ', '\38' ){ ! while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?])/ /i ) { my $word = $2; print "$word ->" if $self->{debug}; ! $word =~ s/\Q$space\E//g; print "$word\n" if $self->{debug}; ! update_word( $self, $word, $encoded, ' ', ' ', $prefix); ! increment_word( $self, 'trick:spacedout' ); } } --- 299,312 ---- # Deal with runs of alternating spaces and letters ! foreach my $space (' ', '\'', '*', '^', '`', ' ', '\38', '.' ){ ! while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?,])/ /i ) { ! my $original = "$1$2$4"; my $word = $2; print "$word ->" if $self->{debug}; ! $word =~ s/[^A-Z]//gi; print "$word\n" if $self->{debug}; ! $self->update_word( $word, $encoded, ' ', ' ', $prefix); ! $self->update_pseudoword( 'trick', 'spacedout', $encoded, $original ); } } *************** *** 300,304 **** while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) { ! increment_word( $self, 'trick:dottedwords' ); } --- 315,319 ---- while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) { ! $self->update_pseudoword( 'trick', 'dottedwords', $encoded, "$1$2" ); } *************** *** 320,324 **** } else { if ( $bigline ne '' ) { ! $self->increment_word( 'trick:invisibleink' ); } } --- 335,339 ---- } else { if ( $bigline ne '' ) { ! $self->update_pseudoword( 'trick', 'invisibleink', $encoded, $bigline ); } } *************** *** 355,370 **** } - return; - } - # If we hit a table tag then any font information is lost if ( $tag =~ /^(table|td|tr|th)$/i ) { ! $self->{htmlfontcolor__} = map_color( $self, 'black' ); ! $self->{htmlbackcolor__} = map_color( $self, 'white' ); } ! # Count the number of TD elements ! increment_word( $self, 'html:td' ) if ( $tag =~ /^td$/i ); my $attribute; --- 370,385 ---- } # If we hit a table tag then any font information is lost if ( $tag =~ /^(table|td|tr|th)$/i ) { ! $self->{htmlfontcolor__} = map_color( $self, 'black' ); ! $self->{htmlbackcolor__} = $self->{htmlbodycolor__}; } ! return; ! } ! ! # Count the number of TD elements ! $self->update_pseudoword('html', 'td', $encoded, $tag ) if ( $tag =~ /^td$/i ); my $attribute; *************** *** 381,392 **** # match a space or > or EOL ! while ( $arg =~ s/[ \t]*(\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(?(2)\2|($|([ \t>])))//i ) { ! $attribute = $1; ! $value = $3; $quote = ''; $end_quote = '[\> \t\&\n]'; ! if (defined $2) { ! $quote = $2; ! $end_quote = $2; } --- 396,410 ---- # match a space or > or EOL ! my $original; ! ! while ( $arg =~ s/[ \t]*((\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(\3|($|([ \t>]))))//i ) { ! $original = $1; ! $attribute = $2; ! $value = $4; $quote = ''; $end_quote = '[\> \t\&\n]'; ! if (defined $3) { ! $quote = $3; ! $end_quote = $3; } *************** *** 410,414 **** ( ( $tag =~ /^img|frame|iframe$/i ) || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) { ! # "CID:" links refer to an origin-controlled attachment to a html email. # Adding strings from these, even if they appear to be hostnames, may or --- 428,432 ---- ( ( $tag =~ /^img|frame|iframe$/i ) || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) { ! # "CID:" links refer to an origin-controlled attachment to a html email. # Adding strings from these, even if they appear to be hostnames, may or *************** *** 418,425 **** { # TODO: Decide what to do here, ignoring CID's for now - } else { ! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' ); # If the host name is not blank (i.e. there was a hostname in the url --- 436,442 ---- { # TODO: Decide what to do here, ignoring CID's for now } else { ! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '', 1 ); # If the host name is not blank (i.e. there was a hostname in the url *************** *** 428,438 **** if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) { ! if ( $host ne 'localhost' ) { ! $self->update_pseudoword( 'html', 'imgremotesrc' ); ! } } } - next; } --- 445,455 ---- if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) { ! if ( $host ne 'localhost' ) { ! $self->update_pseudoword( 'html', 'imgremotesrc', $encoded, $original ); ! } } } + add_url( $self, $value, $encoded, $quote, $end_quote, '' ); next; } *************** *** 499,503 **** if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) { $attribute = lc( $attribute ); ! $self->update_pseudoword( 'html', "img$attribute$value" ); } --- 516,520 ---- if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) { $attribute = lc( $attribute ); ! $self->update_pseudoword( 'html', "img$attribute$value", $encoded, $original ); } *************** *** 505,509 **** if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) { ! $self->update_pseudoword( 'html', "fontsize$value" ); } --- 522,526 ---- if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) { ! $self->update_pseudoword( 'html', "fontsize$value", $encoded, $original ); } *************** *** 514,517 **** --- 531,536 ---- $self->{htmlbackcolor__} = map_color($self, $value); print "Set html back color to $self->{htmlbackcolor__}\n" if ( $self->{debug} ); + + $self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i ); } *************** *** 564,567 **** --- 583,587 ---- # $prefix A string to prefix any words with in the corpus, used for the special # identification of values found in for example the subject line + # $noadd If defined indicates that only parsing should be done, no word updates # # Returns the hostname *************** *** 570,574 **** sub add_url { ! my ($self, $url, $encoded, $before, $after, $prefix) = @_; my $temp_url = $url; --- 590,594 ---- sub add_url { ! my ($self, $url, $encoded, $before, $after, $prefix, $noadd) = @_; my $temp_url = $url; *************** *** 592,600 **** # Remove any URL encoding (protocol may not be URL encoded) ! if ( $url =~ s/(\%([0-9A-Fa-f][0-9A-Fa-f]))/chr(hex("0x$2"))/eg ) { ! increment_word( $self, "html:encodedurl" ); ! my $new_url = (defined $protocol?"$protocol://":'') . $url; ! print "$temp_url -> " . $new_url . "\n" if $self->{debug}; ! $self->{ut__} =~ s/$temp_url/$new_url/e if $self->{color__}; } --- 612,620 ---- # Remove any URL encoding (protocol may not be URL encoded) ! my $oldurl = $url; ! my $percents = ( $url =~ s/(%([0-9A-Fa-f]{2}))/chr(hex("0x$2"))/ge ); ! ! if ( $percents > 0 ) { ! $self->update_pseudoword( 'html', 'encodedurl', $encoded, $oldurl ) if ( !defined( $noadd ) ); } *************** *** 689,693 **** $temp_after = "[\:]" if (defined $port); ! update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix); # decided not to care about tld's beyond the verification performed when --- 709,713 ---- $temp_after = "[\:]" if (defined $port); ! update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix) if ( !defined( $noadd ) ); # decided not to care about tld's beyond the verification performed when *************** *** 698,702 **** if ( $hostform eq "name" ) { while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) { ! update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix); } } --- 718,722 ---- if ( $hostform eq "name" ) { while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) { ! update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix) if ( !defined( $noadd ) ); } } *************** *** 730,735 **** # Remove HTML comments and other tags that begin ! ! while ( $line =~ s/<!.*?>// ) { ! increment_word( $self, 'html:comment' ); print "$line\n" if $self->{debug}; } --- 750,755 ---- # Remove HTML comments and other tags that begin ! ! while ( $line =~ s/(<!.*?>)// ) { ! $self->update_pseudoword( 'html', 'comment', $encoded, $1 ); print "$line\n" if $self->{debug}; } *************** *** 1157,1163 **** if ($self->{color__}) { # Remove over-reading ! $self->{ut__} = ''; ! ! # Qeueue just this header for colorization $self->{ut__} = splitline("$header: $argument\015\012", $encoding); } --- 1177,1183 ---- if ($self->{color__}) { # Remove over-reading ! $self->{ut__} = ''; ! ! # Qeueue just this header for colorization $self->{ut__} = splitline("$header: $argument\015\012", $encoding); } *************** *** 1168,1175 **** # much spam uses MIME-Version, MiME-Version and Mime-Version ! $self->update_pseudoword( 'header', $header ); # Check the encoding type in all RFC 2047 encoded headers ! if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) { update_word( $self, $1, 0, '', '', 'charset' ); --- 1188,1195 ---- # much spam uses MIME-Version, MiME-Version and Mime-Version ! $self->update_pseudoword( 'header', $header, 0, $header ); # Check the encoding type in all RFC 2047 encoded headers ! if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) { update_word( $self, $1, 0, '', '', 'charset' ); *************** *** 1178,1182 **** # Handle the From, To and Cc headers and extract email addresses # from them and treat them as words - # For certain headers we are going to mark them specially in the corpus --- 1198,1201 ---- *************** *** 1282,1286 **** my $compact_encoding = $encoding; $compact_encoding =~ s/[^A-Za-z0-9]//g; ! increment_word( $self, "encoding:$compact_encoding" ); return ($mime, $encoding); } --- 1301,1305 ---- my $compact_encoding = $encoding; $compact_encoding =~ s/[^A-Za-z0-9]//g; ! $self->update_pseudoword( 'encoding', $compact_encoding, 0, $encoding ); return ($mime, $encoding); } *************** *** 1289,1297 **** return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i ); ! # Some headers should never be RFC 2047 decoded ! $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i); ! add_line( $self, $argument, 0, $prefix ); --- 1308,1316 ---- return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i ); ! # Some headers should never be RFC 2047 decoded ! $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i); ! add_line( $self, $argument, 0, $prefix ); *************** *** 1313,1317 **** my ($line, $encoding) = @_; $line =~ s/([^\r\n]{100,120} )/$1\r\n/g; ! $line =~ s/([^ \r\n]{120})/$1\r\n/g; $line =~ s/</</g; --- 1332,1336 ---- my ($line, $encoding) = @_; $line =~ s/([^\r\n]{100,120} )/$1\r\n/g; ! $line =~ s/([^ \r\n]{120})/$1\r\n/g; $line =~ s/</</g; *************** *** 1324,1329 **** $line =~ s/\t/ /g; ! ! return $line; } --- 1343,1348 ---- $line =~ s/\t/ /g; ! ! return $line; } |