|
From: <jgr...@us...> - 2003-05-05 17:50:16
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv9618/Classifier
Modified Files:
MailParse.pm
Log Message:
Numerous updates to the mail parser to make colorization work for pseudowords, and fix various bugs
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.123
retrieving revision 1.124
diff -C2 -d -r1.123 -r1.124
*** MailParse.pm 30 Apr 2003 03:13:36 -0000 1.123
--- MailParse.pm 5 May 2003 17:50:10 -0000 1.124
***************
*** 70,73 ****
--- 70,74 ----
$self->{htmlbackcolor__} = map_color( $self, 'white' );
+ $self->{htmlbodycolor__} = map_color( $self, 'white' );
$self->{htmlfontcolor__} = map_color( $self, 'black' );
***************
*** 147,150 ****
--- 148,153 ----
# $prefix The pseudoword prefix (e.g. header)
# $word The pseudoword (e.g. Mime-Version)
+ # $encoded Whether this was found inside encoded text
+ # $literal The literal text that generated this pseudoword
#
# ---------------------------------------------------------------------------------------------
***************
*** 152,158 ****
sub update_pseudoword
{
! my ( $self, $prefix, $word ) = @_;
! $self->increment_word( "$prefix:$word" );
}
--- 155,175 ----
sub update_pseudoword
{
! my ( $self, $prefix, $word, $encoded, $literal ) = @_;
! my $mword = "$prefix:$word";
!
! if ( $self->{color__} ) {
! $literal =~ s/</</g;
! $literal =~ s/>/>/g;
! my $color = $self->{bayes__}->get_color($mword);
! my $to = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>";
! if ( $encoded == 0 ) {
! $self->{ut__} =~ s/\Q$literal\E/$to/g;
! } else {
! $self->{ut__} .= $to . ' ';
! }
! } else {
! $self->increment_word( $mword );
! }
}
***************
*** 194,198 ****
$self->{ut__} .= "<font color=\"$color\">$word<\/font> ";
}
-
} else {
increment_word( $self, $mword );
--- 211,214 ----
***************
*** 259,263 ****
$self->{ut__} =~ s/$from/$to/g;
print "$from -> $to\n" if $self->{debug};
! increment_word( $self, 'html:numericentity' );
}
}
--- 275,279 ----
$self->{ut__} =~ s/$from/$to/g;
print "$from -> $to\n" if $self->{debug};
! $self->update_pseudoword( 'html', 'numericentity', $encoded, $from );
}
}
***************
*** 283,297 ****
# Deal with runs of alternating spaces and letters
- # TODO: find a way to make this (and other similar stuff) highlight
- # without using the encoded content printer or modifying $self->{ut__}
! foreach my $space (' ', '\'', '*', '^', '`', ' ', '\38' ){
! while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?])/ /i ) {
my $word = $2;
print "$word ->" if $self->{debug};
! $word =~ s/\Q$space\E//g;
print "$word\n" if $self->{debug};
! update_word( $self, $word, $encoded, ' ', ' ', $prefix);
! increment_word( $self, 'trick:spacedout' );
}
}
--- 299,312 ----
# Deal with runs of alternating spaces and letters
! foreach my $space (' ', '\'', '*', '^', '`', ' ', '\38', '.' ){
! while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?,])/ /i ) {
! my $original = "$1$2$4";
my $word = $2;
print "$word ->" if $self->{debug};
! $word =~ s/[^A-Z]//gi;
print "$word\n" if $self->{debug};
! $self->update_word( $word, $encoded, ' ', ' ', $prefix);
! $self->update_pseudoword( 'trick', 'spacedout', $encoded, $original );
}
}
***************
*** 300,304 ****
while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) {
! increment_word( $self, 'trick:dottedwords' );
}
--- 315,319 ----
while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) {
! $self->update_pseudoword( 'trick', 'dottedwords', $encoded, "$1$2" );
}
***************
*** 320,324 ****
} else {
if ( $bigline ne '' ) {
! $self->increment_word( 'trick:invisibleink' );
}
}
--- 335,339 ----
} else {
if ( $bigline ne '' ) {
! $self->update_pseudoword( 'trick', 'invisibleink', $encoded, $bigline );
}
}
***************
*** 355,370 ****
}
- return;
- }
-
# If we hit a table tag then any font information is lost
if ( $tag =~ /^(table|td|tr|th)$/i ) {
! $self->{htmlfontcolor__} = map_color( $self, 'black' );
! $self->{htmlbackcolor__} = map_color( $self, 'white' );
}
! # Count the number of TD elements
! increment_word( $self, 'html:td' ) if ( $tag =~ /^td$/i );
my $attribute;
--- 370,385 ----
}
# If we hit a table tag then any font information is lost
if ( $tag =~ /^(table|td|tr|th)$/i ) {
! $self->{htmlfontcolor__} = map_color( $self, 'black' );
! $self->{htmlbackcolor__} = $self->{htmlbodycolor__};
}
! return;
! }
!
! # Count the number of TD elements
! $self->update_pseudoword('html', 'td', $encoded, $tag ) if ( $tag =~ /^td$/i );
my $attribute;
***************
*** 381,392 ****
# match a space or > or EOL
! while ( $arg =~ s/[ \t]*(\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(?(2)\2|($|([ \t>])))//i ) {
! $attribute = $1;
! $value = $3;
$quote = '';
$end_quote = '[\> \t\&\n]';
! if (defined $2) {
! $quote = $2;
! $end_quote = $2;
}
--- 396,410 ----
# match a space or > or EOL
! my $original;
!
! while ( $arg =~ s/[ \t]*((\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(\3|($|([ \t>]))))//i ) {
! $original = $1;
! $attribute = $2;
! $value = $4;
$quote = '';
$end_quote = '[\> \t\&\n]';
! if (defined $3) {
! $quote = $3;
! $end_quote = $3;
}
***************
*** 410,414 ****
( ( $tag =~ /^img|frame|iframe$/i )
|| ( $tag =~ /^script$/i && $parse_script_uri ) ) ) {
!
# "CID:" links refer to an origin-controlled attachment to a html email.
# Adding strings from these, even if they appear to be hostnames, may or
--- 428,432 ----
( ( $tag =~ /^img|frame|iframe$/i )
|| ( $tag =~ /^script$/i && $parse_script_uri ) ) ) {
!
# "CID:" links refer to an origin-controlled attachment to a html email.
# Adding strings from these, even if they appear to be hostnames, may or
***************
*** 418,425 ****
{
# TODO: Decide what to do here, ignoring CID's for now
-
} else {
! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' );
# If the host name is not blank (i.e. there was a hostname in the url
--- 436,442 ----
{
# TODO: Decide what to do here, ignoring CID's for now
} else {
! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '', 1 );
# If the host name is not blank (i.e. there was a hostname in the url
***************
*** 428,438 ****
if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) {
! if ( $host ne 'localhost' ) {
! $self->update_pseudoword( 'html', 'imgremotesrc' );
! }
}
}
-
next;
}
--- 445,455 ----
if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) {
! if ( $host ne 'localhost' ) {
! $self->update_pseudoword( 'html', 'imgremotesrc', $encoded, $original );
! }
}
}
+ add_url( $self, $value, $encoded, $quote, $end_quote, '' );
next;
}
***************
*** 499,503 ****
if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) {
$attribute = lc( $attribute );
! $self->update_pseudoword( 'html', "img$attribute$value" );
}
--- 516,520 ----
if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) {
$attribute = lc( $attribute );
! $self->update_pseudoword( 'html', "img$attribute$value", $encoded, $original );
}
***************
*** 505,509 ****
if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
! $self->update_pseudoword( 'html', "fontsize$value" );
}
--- 522,526 ----
if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
! $self->update_pseudoword( 'html', "fontsize$value", $encoded, $original );
}
***************
*** 514,517 ****
--- 531,536 ----
$self->{htmlbackcolor__} = map_color($self, $value);
print "Set html back color to $self->{htmlbackcolor__}\n" if ( $self->{debug} );
+
+ $self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i );
}
***************
*** 564,567 ****
--- 583,587 ----
# $prefix A string to prefix any words with in the corpus, used for the special
# identification of values found in for example the subject line
+ # $noadd If defined indicates that only parsing should be done, no word updates
#
# Returns the hostname
***************
*** 570,574 ****
sub add_url
{
! my ($self, $url, $encoded, $before, $after, $prefix) = @_;
my $temp_url = $url;
--- 590,594 ----
sub add_url
{
! my ($self, $url, $encoded, $before, $after, $prefix, $noadd) = @_;
my $temp_url = $url;
***************
*** 592,600 ****
# Remove any URL encoding (protocol may not be URL encoded)
! if ( $url =~ s/(\%([0-9A-Fa-f][0-9A-Fa-f]))/chr(hex("0x$2"))/eg ) {
! increment_word( $self, "html:encodedurl" );
! my $new_url = (defined $protocol?"$protocol://":'') . $url;
! print "$temp_url -> " . $new_url . "\n" if $self->{debug};
! $self->{ut__} =~ s/$temp_url/$new_url/e if $self->{color__};
}
--- 612,620 ----
# Remove any URL encoding (protocol may not be URL encoded)
! my $oldurl = $url;
! my $percents = ( $url =~ s/(%([0-9A-Fa-f]{2}))/chr(hex("0x$2"))/ge );
!
! if ( $percents > 0 ) {
! $self->update_pseudoword( 'html', 'encodedurl', $encoded, $oldurl ) if ( !defined( $noadd ) );
}
***************
*** 689,693 ****
$temp_after = "[\:]" if (defined $port);
! update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix);
# decided not to care about tld's beyond the verification performed when
--- 709,713 ----
$temp_after = "[\:]" if (defined $port);
! update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix) if ( !defined( $noadd ) );
# decided not to care about tld's beyond the verification performed when
***************
*** 698,702 ****
if ( $hostform eq "name" ) {
while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) {
! update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix);
}
}
--- 718,722 ----
if ( $hostform eq "name" ) {
while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) {
! update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix) if ( !defined( $noadd ) );
}
}
***************
*** 730,735 ****
# Remove HTML comments and other tags that begin !
! while ( $line =~ s/<!.*?>// ) {
! increment_word( $self, 'html:comment' );
print "$line\n" if $self->{debug};
}
--- 750,755 ----
# Remove HTML comments and other tags that begin !
! while ( $line =~ s/(<!.*?>)// ) {
! $self->update_pseudoword( 'html', 'comment', $encoded, $1 );
print "$line\n" if $self->{debug};
}
***************
*** 1157,1163 ****
if ($self->{color__}) {
# Remove over-reading
! $self->{ut__} = '';
!
! # Qeueue just this header for colorization
$self->{ut__} = splitline("$header: $argument\015\012", $encoding);
}
--- 1177,1183 ----
if ($self->{color__}) {
# Remove over-reading
! $self->{ut__} = '';
!
! # Qeueue just this header for colorization
$self->{ut__} = splitline("$header: $argument\015\012", $encoding);
}
***************
*** 1168,1175 ****
# much spam uses MIME-Version, MiME-Version and Mime-Version
! $self->update_pseudoword( 'header', $header );
# Check the encoding type in all RFC 2047 encoded headers
!
if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
update_word( $self, $1, 0, '', '', 'charset' );
--- 1188,1195 ----
# much spam uses MIME-Version, MiME-Version and Mime-Version
! $self->update_pseudoword( 'header', $header, 0, $header );
# Check the encoding type in all RFC 2047 encoded headers
!
if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
update_word( $self, $1, 0, '', '', 'charset' );
***************
*** 1178,1182 ****
# Handle the From, To and Cc headers and extract email addresses
# from them and treat them as words
-
# For certain headers we are going to mark them specially in the corpus
--- 1198,1201 ----
***************
*** 1282,1286 ****
my $compact_encoding = $encoding;
$compact_encoding =~ s/[^A-Za-z0-9]//g;
! increment_word( $self, "encoding:$compact_encoding" );
return ($mime, $encoding);
}
--- 1301,1305 ----
my $compact_encoding = $encoding;
$compact_encoding =~ s/[^A-Za-z0-9]//g;
! $self->update_pseudoword( 'encoding', $compact_encoding, 0, $encoding );
return ($mime, $encoding);
}
***************
*** 1289,1297 ****
return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
!
# Some headers should never be RFC 2047 decoded
!
$argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
!
add_line( $self, $argument, 0, $prefix );
--- 1308,1316 ----
return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
!
# Some headers should never be RFC 2047 decoded
!
$argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
!
add_line( $self, $argument, 0, $prefix );
***************
*** 1313,1317 ****
my ($line, $encoding) = @_;
$line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
! $line =~ s/([^ \r\n]{120})/$1\r\n/g;
$line =~ s/</</g;
--- 1332,1336 ----
my ($line, $encoding) = @_;
$line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
! $line =~ s/([^ \r\n]{120})/$1\r\n/g;
$line =~ s/</</g;
***************
*** 1324,1329 ****
$line =~ s/\t/ /g;
!
! return $line;
}
--- 1343,1348 ----
$line =~ s/\t/ /g;
!
! return $line;
}
|