Thread: [popfile-commit] engine/Classifier MailParse.pm,1.123,1.124

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv9618/Classifier

Modified Files:
	MailParse.pm 
Log Message:
Numerous updates to the mail parser to make colorization work for pseudowords, and fix various bugs

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.123
retrieving revision 1.124
diff -C2 -d -r1.123 -r1.124
*** MailParse.pm	30 Apr 2003 03:13:36 -0000	1.123
--- MailParse.pm	5 May 2003 17:50:10 -0000	1.124
***************
*** 70,73 ****
--- 70,74 ----

      $self->{htmlbackcolor__} = map_color( $self, 'white' );
+     $self->{htmlbodycolor__} = map_color( $self, 'white' );
      $self->{htmlfontcolor__} = map_color( $self, 'black' );

***************
*** 147,150 ****
--- 148,153 ----
  # $prefix       The pseudoword prefix (e.g. header)
  # $word         The pseudoword (e.g. Mime-Version)
+ # $encoded      Whether this was found inside encoded text
+ # $literal      The literal text that generated this pseudoword
  #
  # ---------------------------------------------------------------------------------------------
***************
*** 152,158 ****
  sub update_pseudoword
  {
!     my ( $self, $prefix, $word ) = @_;

!     $self->increment_word( "$prefix:$word" );
  }

--- 155,175 ----
  sub update_pseudoword
  {
!     my ( $self, $prefix, $word, $encoded, $literal ) = @_;

!     my $mword = "$prefix:$word";
! 
!     if ( $self->{color__} ) {
!         $literal =~ s/</&lt;/g;
!         $literal =~ s/>/&gt;/g;
!         my $color = $self->{bayes__}->get_color($mword);
!         my $to    = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>";
!         if ( $encoded == 0 )  {
! 	    $self->{ut__} =~ s/\Q$literal\E/$to/g;
!         } else {
!             $self->{ut__} .= $to . ' ';
!         }
!     } else {
!         $self->increment_word( $mword );
!     }
  }

***************
*** 194,198 ****
                  $self->{ut__} .= "<font color=\"$color\">$word<\/font> ";
              }
- 
          } else {
              increment_word( $self, $mword );
--- 211,214 ----
***************
*** 259,263 ****
                          $self->{ut__} =~ s/$from/$to/g;
                          print "$from -> $to\n" if $self->{debug};
!                         increment_word( $self, 'html:numericentity' );
                      }
                  }
--- 275,279 ----
                          $self->{ut__} =~ s/$from/$to/g;
                          print "$from -> $to\n" if $self->{debug};
!                         $self->update_pseudoword( 'html', 'numericentity', $encoded, $from );
                      }
                  }
***************
*** 283,297 ****

              # Deal with runs of alternating spaces and letters
-             # TODO: find a way to make this (and other similar stuff) highlight
-             #       without using the encoded content printer or modifying $self->{ut__}

!             foreach my $space (' ', '\'', '*', '^', '`', '  ', '\38' ){
!                 while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?])/ /i ) {
                      my $word = $2;
                      print "$word ->" if $self->{debug};
!                     $word    =~ s/\Q$space\E//g;
                      print "$word\n" if $self->{debug};
!                     update_word( $self, $word, $encoded, ' ', ' ', $prefix);
!                     increment_word( $self, 'trick:spacedout' );
                  }
              }
--- 299,312 ----

              # Deal with runs of alternating spaces and letters

!             foreach my $space (' ', '\'', '*', '^', '`', '  ', '\38', '.' ){
!                 while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?,])/ /i ) {
!                     my $original = "$1$2$4";
                      my $word = $2;
                      print "$word ->" if $self->{debug};
!                     $word    =~ s/[^A-Z]//gi;
                      print "$word\n" if $self->{debug};
!                     $self->update_word( $word, $encoded, ' ', ' ', $prefix);
!                     $self->update_pseudoword( 'trick', 'spacedout', $encoded, $original );
                  }
              }
***************
*** 300,304 ****

              while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) {
!                 increment_word( $self, 'trick:dottedwords' );
              }

--- 315,319 ----

              while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) {
!                 $self->update_pseudoword( 'trick', 'dottedwords', $encoded, "$1$2" );
              }

***************
*** 320,324 ****
      } else {
          if ( $bigline ne '' ) {
!     	    $self->increment_word( 'trick:invisibleink' );
  	}
      }
--- 335,339 ----
      } else {
          if ( $bigline ne '' ) {
!             $self->update_pseudoword( 'trick', 'invisibleink', $encoded, $bigline );
  	}
      }
***************
*** 355,370 ****
          }

-         return;
-     }
- 
  	# If we hit a table tag then any font information is lost

  	if ( $tag =~ /^(table|td|tr|th)$/i ) {
! 		$self->{htmlfontcolor__} = map_color( $self, 'black' );
! 		$self->{htmlbackcolor__} = map_color( $self, 'white' );
  	}

! 	# Count the number of TD elements
! 	increment_word( $self, 'html:td' ) if ( $tag =~ /^td$/i );

      my $attribute;
--- 370,385 ----
          }

  	# If we hit a table tag then any font information is lost

  	if ( $tag =~ /^(table|td|tr|th)$/i ) {
! 	    $self->{htmlfontcolor__} = map_color( $self, 'black' );
! 	    $self->{htmlbackcolor__} = $self->{htmlbodycolor__};
  	}

!         return;
!     }
! 
!     # Count the number of TD elements
!     $self->update_pseudoword('html', 'td', $encoded, $tag ) if ( $tag =~ /^td$/i );

      my $attribute;
***************
*** 381,392 ****
      # match a space or > or EOL

!     while ( $arg =~ s/[ \t]*(\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(?(2)\2|($|([ \t>])))//i ) {
!         $attribute = $1;
!         $value     = $3;
          $quote     = '';
          $end_quote = '[\> \t\&\n]';
!         if (defined $2) {
!             $quote     = $2;
!             $end_quote = $2;
          }

--- 396,410 ----
      # match a space or > or EOL

!     my $original;
! 
!     while ( $arg =~ s/[ \t]*((\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(\3|($|([ \t>]))))//i ) {
!         $original  = $1;
!         $attribute = $2;
!         $value     = $4;
          $quote     = '';
          $end_quote = '[\> \t\&\n]';
!         if (defined $3) {
!             $quote     = $3;
!             $end_quote = $3;
          }

***************
*** 410,414 ****
               ( ( $tag =~ /^img|frame|iframe$/i )
                 || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) {
!                 
              # "CID:" links refer to an origin-controlled attachment to a html email.
              # Adding strings from these, even if they appear to be hostnames, may or
--- 428,432 ----
               ( ( $tag =~ /^img|frame|iframe$/i )
                 || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) {
! 
              # "CID:" links refer to an origin-controlled attachment to a html email.
              # Adding strings from these, even if they appear to be hostnames, may or
***************
*** 418,425 ****
              {
                  # TODO: Decide what to do here, ignoring CID's for now
-                 
              } else {

!                 my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' );                

                  # If the host name is not blank (i.e. there was a hostname in the url
--- 436,442 ----
              {
                  # TODO: Decide what to do here, ignoring CID's for now
              } else {

!                 my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '', 1 );

                  # If the host name is not blank (i.e. there was a hostname in the url
***************
*** 428,438 ****

                  if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) {
!         	        if ( $host ne 'localhost' ) {
!                         $self->update_pseudoword( 'html', 'imgremotesrc' );
!         	        }
                  }
              }
-             

              next;
          }
--- 445,455 ----

                  if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) {
!         	    if ( $host ne 'localhost' ) {
!                         $self->update_pseudoword( 'html', 'imgremotesrc', $encoded, $original );
!         	    }
                  }
              }

+             add_url( $self, $value, $encoded, $quote, $end_quote, '' );
              next;
          }
***************
*** 499,503 ****
          if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) {
              $attribute = lc( $attribute );
!             $self->update_pseudoword( 'html', "img$attribute$value" );
          }

--- 516,520 ----
          if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) {
              $attribute = lc( $attribute );
!             $self->update_pseudoword( 'html', "img$attribute$value", $encoded, $original );
          }

***************
*** 505,509 ****

          if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
!             $self->update_pseudoword( 'html', "fontsize$value" );
          }

--- 522,526 ----

          if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
!             $self->update_pseudoword( 'html', "fontsize$value", $encoded, $original );
          }

***************
*** 514,517 ****
--- 531,536 ----
              $self->{htmlbackcolor__} = map_color($self, $value);
  			print "Set html back color to $self->{htmlbackcolor__}\n" if ( $self->{debug} );
+ 
+             $self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i );
          }

***************
*** 564,567 ****
--- 583,587 ----
  # $prefix       A string to prefix any words with in the corpus, used for the special
  #               identification of values found in for example the subject line
+ # $noadd        If defined indicates that only parsing should be done, no word updates
  #
  # Returns the hostname
***************
*** 570,574 ****
  sub add_url
  {
!     my ($self, $url, $encoded, $before, $after, $prefix) = @_;

      my $temp_url = $url;
--- 590,594 ----
  sub add_url
  {
!     my ($self, $url, $encoded, $before, $after, $prefix, $noadd) = @_;

      my $temp_url = $url;
***************
*** 592,600 ****
      # Remove any URL encoding (protocol may not be URL encoded)

!     if ( $url =~ s/(\%([0-9A-Fa-f][0-9A-Fa-f]))/chr(hex("0x$2"))/eg ) {
!         increment_word( $self, "html:encodedurl" );
!         my $new_url = (defined $protocol?"$protocol://":'') . $url;
!         print "$temp_url -> " . $new_url . "\n" if $self->{debug};
!         $self->{ut__} =~ s/$temp_url/$new_url/e if $self->{color__};
      }

--- 612,620 ----
      # Remove any URL encoding (protocol may not be URL encoded)

!     my $oldurl   = $url;
!     my $percents =  ( $url =~ s/(%([0-9A-Fa-f]{2}))/chr(hex("0x$2"))/ge );
! 
!     if ( $percents > 0 ) {
!         $self->update_pseudoword( 'html', 'encodedurl', $encoded, $oldurl ) if ( !defined( $noadd ) );
      }

***************
*** 689,693 ****
          $temp_after = "[\:]" if (defined $port);

!         update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix);

          # decided not to care about tld's beyond the verification performed when
--- 709,713 ----
          $temp_after = "[\:]" if (defined $port);

!         update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix) if ( !defined( $noadd ) );

          # decided not to care about tld's beyond the verification performed when
***************
*** 698,702 ****
          if ( $hostform eq "name" ) {
              while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) {
!                 update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix);
              }
          }
--- 718,722 ----
          if ( $hostform eq "name" ) {
              while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) {
!                 update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix) if ( !defined( $noadd ) );
              }
          }
***************
*** 730,735 ****
      # Remove HTML comments and other tags that begin !

!     while ( $line =~ s/<!.*?>// ) {
!         increment_word( $self, 'html:comment' );
          print "$line\n" if $self->{debug};
      }
--- 750,755 ----
      # Remove HTML comments and other tags that begin !

!     while ( $line =~ s/(<!.*?>)// ) {
!         $self->update_pseudoword( 'html', 'comment', $encoded, $1 );
          print "$line\n" if $self->{debug};
      }
***************
*** 1157,1163 ****
      if ($self->{color__}) {
          # Remove over-reading
!         $self->{ut__} = '';  
!         
!         # Qeueue just this header for colorization    
          $self->{ut__} = splitline("$header: $argument\015\012", $encoding);
      }
--- 1177,1183 ----
      if ($self->{color__}) {
          # Remove over-reading
!         $self->{ut__} = '';
! 
!         # Qeueue just this header for colorization
          $self->{ut__} = splitline("$header: $argument\015\012", $encoding);
      }
***************
*** 1168,1175 ****
      # much spam uses MIME-Version, MiME-Version and Mime-Version

!     $self->update_pseudoword( 'header', $header );

      # Check the encoding type in all RFC 2047 encoded headers
!     
      if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
              update_word( $self, $1, 0, '', '', 'charset' );
--- 1188,1195 ----
      # much spam uses MIME-Version, MiME-Version and Mime-Version

!     $self->update_pseudoword( 'header', $header, 0, $header );

      # Check the encoding type in all RFC 2047 encoded headers
! 
      if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
              update_word( $self, $1, 0, '', '', 'charset' );
***************
*** 1178,1182 ****
      # Handle the From, To and Cc headers and extract email addresses
      # from them and treat them as words
-     

      # For certain headers we are going to mark them specially in the corpus
--- 1198,1201 ----
***************
*** 1282,1286 ****
          my $compact_encoding = $encoding;
          $compact_encoding =~ s/[^A-Za-z0-9]//g;
!         increment_word( $self, "encoding:$compact_encoding" );
          return ($mime, $encoding);
      }
--- 1301,1305 ----
          my $compact_encoding = $encoding;
          $compact_encoding =~ s/[^A-Za-z0-9]//g;
!         $self->update_pseudoword( 'encoding', $compact_encoding, 0, $encoding );
          return ($mime, $encoding);
      }
***************
*** 1289,1297 ****

      return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
!     
      # Some headers should never be RFC 2047 decoded
!     
      $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
!         
      add_line( $self, $argument, 0, $prefix );

--- 1308,1316 ----

      return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
! 
      # Some headers should never be RFC 2047 decoded
! 
      $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
! 
      add_line( $self, $argument, 0, $prefix );

***************
*** 1313,1317 ****
      my ($line, $encoding) = @_;
      $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
!     $line =~ s/([^ \r\n]{120})/$1\r\n/g;        

      $line =~ s/</&lt;/g;
--- 1332,1336 ----
      my ($line, $encoding) = @_;
      $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
!     $line =~ s/([^ \r\n]{120})/$1\r\n/g;

      $line =~ s/</&lt;/g;
***************
*** 1324,1329 ****

      $line =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
!     
!     return $line;        
  }

--- 1343,1348 ----

      $line =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
! 
!     return $line;
  }