|
From: <jgr...@us...> - 2003-06-18 20:34:58
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv28368/Classifier
Modified Files:
MailParse.pm
Log Message:
Restored POPFile test suite to working order; note that many tests are currently failing and need to be updated, I am on this. Also added code to detect the Camouflage spam trick by using Pythagoras on the two RGB values
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.134
retrieving revision 1.135
diff -C2 -d -r1.134 -r1.135
*** MailParse.pm 17 Jun 2003 17:21:38 -0000 1.134
--- MailParse.pm 18 Jun 2003 20:34:55 -0000 1.135
***************
*** 73,76 ****
--- 73,82 ----
$self->{htmlfontcolor__} = map_color( $self, 'black' );
+ # This is the distance between the back color and the font color
+ # as computed using compute_rgb_distance
+
+ $self->{htmlcolordistance__} = 0;
+ compute_html_color_distance( $self );
+
# This is a mapping between HTML color names and HTML hexadecimal color values used by the
# map_color value to get canonical color values
***************
*** 115,118 ****
--- 121,170 ----
# ---------------------------------------------------------------------------------------------
#
+ # compute_rgb_distance
+ #
+ # Given two RGB colors compute the distance between them by considering them as points
+ # in 3 dimensions and calculating the distance between them (or equivalently the length
+ # of a vector between them)
+ #
+ # $left One color
+ # $right The other color
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub compute_rgb_distance
+ {
+     my ( $self, $left, $right ) = @_;
+
+     # Split each color into its red, green and blue components. The matches
+     # run in list context so an undefined or malformed color yields an empty
+     # list instead of silently reusing the captures of an earlier match
+     my @from = map { hex($_) } ( defined( $left )  ? ( $left  =~ /^(..)(..)(..)$/ ) : () );
+     my @to   = map { hex($_) } ( defined( $right ) ? ( $right =~ /^(..)(..)(..)$/ ) : () );
+
+     # An unparsable color cannot be measured, so report the largest possible
+     # distance (441, black to white) rather than a bogus small one
+     return 441 if ( ( @from != 3 ) || ( @to != 3 ) );
+     # Pythagoras in 3D on the difference vector, truncated to an integer
+     # because we don't need decimal level accuracy
+     my ( $dr, $dg, $db ) = map { $from[$_] - $to[$_] } ( 0 .. 2 );
+     return int( sqrt( $dr*$dr + $dg*$dg + $db*$db ) );
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # compute_html_color_distance
+ #
+ # Calls compute_rgb_distance to set up htmlcolordistance__ from the current HTML back and
+ # font colors
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub compute_html_color_distance
+ {
+     my ( $self ) = @_;
+     # compute_rgb_distance unpacks ( $self, $left, $right ), so $self must be
+     # passed explicitly or the font color would be taken as $self
+     $self->{htmlcolordistance__} = compute_rgb_distance( $self, $self->{htmlfontcolor__}, $self->{htmlbackcolor__} );
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
# map_color
#
***************
*** 174,178 ****
#
# ---------------------------------------------------------------------------------------------
-
sub update_pseudoword
{
--- 226,229 ----
***************
*** 262,265 ****
--- 313,329 ----
if ( $self->{htmlfontcolor__} ne $self->{htmlbackcolor__} ) {
+
+ # If we are adding a line and the colors are different then we will
+ # add a count for the color difference to make sure that we catch
+ # camouflage attacks using similar colors, if the color distance
+ # is less than 100. I chose 100 somewhat arbitrarily but classic
+ # black text on white background has a distance of 441, pure red or
+ # blue on white has distance 360. 100 seems like a reasonable upper
+ # bound for tracking evil spammer tricks with similar colors
+
+ if ( $self->{htmlcolordistance__} < 100 ) {
+ $self->update_pseudoword( 'html', "colordistance$self->{htmlcolordistance__}", $encoded, '' );
+ }
+
while ( $p < length($bigline) ) {
my $line = substr($bigline, $p, 1024);
***************
*** 388,391 ****
--- 452,456 ----
if ( $tag =~ /^font$/i ) {
$self->{htmlfontcolor__} = map_color( $self, 'black' );
+ $self->compute_html_color_distance();
}
***************
*** 395,398 ****
--- 460,464 ----
$self->{htmlfontcolor__} = map_color( $self, 'black' );
$self->{htmlbackcolor__} = $self->{htmlbodycolor__};
+ $self->compute_html_color_distance();
}
***************
*** 526,529 ****
--- 592,596 ----
$self->update_pseudoword( 'html', "fontcolor$value", $encoded, $original );
$self->{htmlfontcolor__} = map_color($self, $value);
+ $self->compute_html_color_distance();
print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug} );
}
***************
*** 533,536 ****
--- 600,604 ----
update_word( $self, $value, $encoded, $quote, $end_quote, '' );
$self->{htmlfontcolor__} = map_color($self, $value);
+ $self->compute_html_color_distance();
print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug} );
}
***************
*** 558,561 ****
--- 626,630 ----
$self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i );
+ $self->compute_html_color_distance();
}
***************
*** 897,900 ****
--- 966,970 ----
$self->{htmlbackcolor__} = map_color( $self, 'white' );
$self->{htmlfontcolor__} = map_color( $self, 'black' );
+ $self->compute_html_color_distance();
$self->{in_headers__} = 1;
|