[popfile-commit] engine/Classifier MailParse.pm,1.91.2.2,1.91.2.3

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv28186

Modified Files:
      Tag: v0/18/1
	MailParse.pm 
Log Message:
Fixes:
695565 Continued cc lines not colorized
702215 POPFile 0.18.1 fails do decode base64 attachments.
702316 0.18.1: nested QP parts - not decoded 

Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.91.2.2
retrieving revision 1.91.2.3
diff -C2 -d -r1.91.2.2 -r1.91.2.3
*** MailParse.pm	10 Apr 2003 22:11:23 -0000	1.91.2.2
--- MailParse.pm	11 Apr 2003 00:33:57 -0000	1.91.2.3
***************
*** 740,743 ****
--- 740,748 ----
      my $encoding = '';

+     # Variables to save header information to while parsing headers
+ 
+     my $header;
+     my $argument;
+ 
      # Clear the word hash

***************
*** 789,967 ****
              print ">>> $line" if $self->{debug};

!             if ( $self->{color} )  {
!                 my $splitline = $line;    
!                 $splitline =~ s/([^\r\n]{100,120} )/$1\r\n/g;
!                 $splitline =~ s/([^ \r\n]{120})/$1\r\n/g;

!                 if ( !$self->{in_html_tag} )  {
!                     $colorized .= $self->{ut} if ( $self->{ut} ne '' );
!                     
                      $self->{ut} = '';
!                 }
!                 
!                 #Escape some HTML characters to ensure display in HTML UI

!                 $splitline =~ s/</&lt;/g;
!                 $splitline =~ s/>/&gt;/g;
!                 
!                 if ( $encoding =~ /quoted\-printable/i ) {
!                     $splitline =~ s/=3C/&lt;/g;
!                     $splitline =~ s/=3E/&gt;/g;
!                 }

!                 $splitline =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
!                 $self->{ut} .= $splitline;
!             }

-             if ($self->{in_headers}) {                
-             
                  # Check for blank line signifying end of headers
-                 
-                 if ( $line =~ /^(\r\n|\r|\n)/) {
-                     $self->{in_headers} = 0;
-                     print "Header parsing complete.\n" if $self->{debug};
-                 }
-                                                 
-                 # If we have an email header then just keep the part after the :
-                 
-                 if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ )  {
-                     my $header   = $1;
-                     my $argument = $2;
-     
-                     print "Header ($header) ($argument)\n" if ($self->{debug});
-     
-                     # Handle the From, To and Cc headers and extract email addresses
-                     # from them and treat them as words

!                     # For certain headers we are going to mark them specially in the corpus
!                     # by tagging them with where they were found to help the classifier
!                     # do a better job.  So if you have
!                     #
!                     # From: fo...@ba...
!                     #
!                     # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...

!                     my $prefix = '';
!     
!                     if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) {
!                         
!                         # Concatenate multi-line fields (To, CC)
!                         
!                         if ( ( $header =~ /^To/i ) || ( $header =~ /^Cc/i ) ) {
!                             my $currpos = tell MSG;
!                             my $tempread = <MSG>;
!                             while ( $tempread =~ s/^[ \t]+(.*?)[\r\n]+// )  {
!                                 if ( $1 ne '' ) {
!                                     $argument .= $1;
!                                     $currpos = tell MSG;
!                                     $tempread = <MSG>;
!                                 } else {
!                                     last;
!                                 }
!                             }
!                             seek MSG, $currpos, 0;
!                             print "\n$header: [[$argument]]\n" if $self->{debug};
!                         }

!                         if ( $argument =~ /=\?(.{1,40})\?/ ) {
!                             update_word( $self, $1, 0, '', '', 'charset' );
!                         }
!                         
!                         if ( $header =~ /^From/i )  {
!                             $encoding     = '';
!                             $self->{content_type} = '';
!                             $self->{from} = $self->decode_string( $argument ) if ( $self->{from} eq '' ) ;
!                             $prefix = 'from';
!                         }

!                         $prefix = 'to' if ( $header =~ /^To/i );
!                         $self->{to} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to} eq '' ) );
!                         
!                         $prefix = 'cc' if ( $header =~ /^Cc/i );
!                         $self->{cc} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc} eq '' ) );
!                         
!                         while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// )  {
!                             update_word($self, $1, 0, ';', '&',$prefix);
!                             add_url($self, $2, 0, '@', '[&<]',$prefix);
!                         }

-                         while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// )  {
-                             update_word($self, $1, 0, '', '',$prefix);
-                             add_url($self, $2, 0, '@', '',$prefix);
-                         }
-     
-                         add_line( $self, $argument, 0, $prefix );
-                         next;
-                     }
-     
-                     $self->{subject} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject} eq '' ) );

!                     if ( $header =~ /^Subject/i ) {
!                         $prefix = 'subject';
!                     }

!                     $self->{date} = $argument if ( $header =~ /^Date/i );

!                     # Look for MIME 

!                     if ( $header =~ /^Content-Type/i ) { 
!                         if ( $argument =~ /multipart\//i ) { 
!                             my $boundary = $argument; 
!                                                         
!                             if ( !( $argument =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i )) {
!                                 $boundary = <MSG>;
!                             }
!                             
!                             if ( $boundary =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { 
!                                                                 
!                                 $boundary = ($2 || $3);
!                                 
!                                 $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g;                                 
!                                                                 
!                                 if ($mime ne '') {
!                                     
!                                     # Fortunately the pipe character isn't a valid mime boundary character!
!                                     
!                                     $mime = join('|', $mime, $boundary);
!                                 } else {
!                                     $mime = $boundary;
!                                 }
!                                 print "Set mime boundary to " . $mime . "\n" if $self->{debug};
!                                 next;
!                             } 
!                         } 

!                         if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
!                             update_word( $self, $1, 0, '' , '', 'charset' );
!                         }
!                         
!                         if ( $argument =~ /^(.*?)(;$)/ ) {
!                             print "Set content type to $1\n" if $self->{debug};                        
!                             $self->{content_type} = $1; 
!                         }
!                         next; 
!                     }
!                         
!                     # Look for the different encodings in a MIME document, when we hit base64 we will
!                     # do a special parse here since words might be broken across the boundaries
!     
!                     if ( $header =~ /^Content-Transfer-Encoding/i ) {
!                         $encoding = $argument;
!                         print "Setting encoding to $encoding\n" if $self->{debug};
!                         my $compact_encoding = $encoding;
!                         $compact_encoding =~ s/[^A-Za-z0-9]//g;
!                         increment_word( $self, "encoding:$compact_encoding" );
!                         next;
!                     }
!     
!                     # Some headers to discard
!     
!                     next if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i );
!     
!                     add_line( $self, $argument, 0, $prefix );
!                     
                      next;
                  }
              }

--- 794,854 ----
              print ">>> $line" if $self->{debug};

!             if ($self->{color}) {

!                 if (!$self->{in_html_tag}) {
!                     $colorized .= $self->{ut};
                      $self->{ut} = '';
!                 }            
!     
!                 $self->{ut} .= splitline($line, $encoding);
!             }

!             if ($self->{in_headers}) {

+                 # temporary colorization while in headers is handled within parse_header

!                 $self->{ut} = '';

                  # Check for blank line signifying end of headers

!                 if ( $line =~ /^(\r\n|\r|\n)/) {
!                     
!                      # Parse the last header                                         
!                     ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);                    

!                     # Clear the saved headers
!                     $header   = '';
!                     $argument = '';
!                     
!                     $self->{ut} .= splitline("\015\012", 0);

!                     $self->{in_headers} = 0;
!                     print "Header parsing complete.\n" if $self->{debug};

!                     next;
!                 }

!                 # If we have an email header then just keep the part after the :

!                 if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ )  {

!                     # Parse the last header

!                     ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding) if ($header ne '');                    

!                     # Save the new information for the current header
! 
!                     $header   = $1;
!                     $argument = $2;
                      next;
                  }
+ 
+                 # Append to argument if the next line begins with whitespace (isn't a new header)
+                 
+                 if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) {
+                     $argument .= "\015\012" . $1;
+                 }
+                 next;
              }

***************
*** 1074,1077 ****
--- 961,1117 ----
  # ---------------------------------------------------------------------------------------------
  #
+ # parse_header - Performs parsing operations on a message header
+ #
+ # $header       Name of header being processed
+ # $argument     Value of header being processed
+ # $mime         The presently saved mime boundaries list
+ # $encoding     Current message encoding
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub parse_header
+ {
+     my ($self, $header, $argument, $mime, $encoding) = @_;
+ 
+     print "Header ($header) ($argument)\n" if ($self->{debug});
+     
+     # Remove over-reading
+     $self->{ut} = '';  
+     
+     # Qeueue just this header for colorization    
+     $self->{ut} = splitline("$header: $argument\015\012", $encoding);
+ 
+     # Check the encoding type in all RFC 2047 encoded headers
+     
+     if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
+             update_word( $self, $1, 0, '', '', 'charset' );
+     }
+ 
+     # Handle the From, To and Cc headers and extract email addresses
+     # from them and treat them as words
+     
+ 
+     # For certain headers we are going to mark them specially in the corpus
+     # by tagging them with where they were found to help the classifier
+     # do a better job.  So if you have
+     #
+     # From: fo...@ba...
+     #
+     # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...
+ 
+     my $prefix = '';
+ 
+     if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) {
+ 
+         # These headers at least can be decoded
+ 
+         $argument = $self->decode_string( $argument );
+ 
+         if ( $argument =~ /=\?(.{1,40})\?/ ) {
+             update_word( $self, $1, 0, '', '', 'charset' );
+         }
+ 
+         if ( $header =~ /^From$/i )  {
+             $encoding     = '';
+             $self->{content_type} = '';            
+             $self->{from} = $argument if ( $self->{from} eq '' ) ;
+             $prefix = 'from';
+         }
+ 
+         if ( $header =~ /^To$/i ) {
+             $prefix = 'to';
+             $self->{to} = $argument if ( $self->{to} eq '' );
+         }
+ 
+         if ( $header =~ /^Cc$/i ) {
+             $prefix = 'cc';
+             $self->{cc} = $argument if ( $self->{cc} eq '' );
+         }
+ 
+         while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// )  {
+             update_word($self, $1, 0, ';', '&',$prefix);
+             add_url($self, $2, 0, '@', '[&<]',$prefix);
+         }
+ 
+         while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// )  {
+             update_word($self, $1, 0, '', '',$prefix);
+             add_url($self, $2, 0, '@', '',$prefix);
+         }
+ 
+         add_line( $self, $argument, 0, $prefix );
+         return ($mime, $encoding);
+     }
+ 
+     if ( $header =~ /^Subject$/i ) {
+         $prefix = 'subject';
+         $argument = $self->decode_string( $argument );
+         $self->{subject} = $argument if ( ( $self->{subject} eq '' ) );
+     }
+ 
+     $self->{date} = $argument if ( $header =~ /^Date/i );
+ 
+     # Look for MIME
+ 
+     if ( $header =~ /^Content-Type$/i ) {
+         
+         if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
+             update_word( $self, $1, 0, '' , '', 'charset' );
+         }
+ 
+         if ( $argument =~ /^(.*?)(;)/ ) {
+             print "Set content type to $1\n" if $self->{debug};
+             $self->{content_type} = $1;
+         }
+         
+         if ( $argument =~ /multipart\//i ) {            
+             my $boundary = $argument;
+ 
+             if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
+ 
+                 $boundary = ($2 || $3);
+ 
+                 $boundary =~ s/(.*)/\Q$1\E/g;
+ 
+                 if ($mime ne '') {
+ 
+                     # Fortunately the pipe character isn't a valid mime boundary character!
+ 
+                     $mime = join('|', $mime, $boundary);
+                 } else {
+                     $mime = $boundary;
+                 }
+                 print "Set mime boundary to " . $mime . "\n" if $self->{debug};
+                 return ($mime, $encoding);
+             }
+         }
+         return ($mime, $encoding);
+     }
+ 
+     # Look for the different encodings in a MIME document, when we hit base64 we will
+     # do a special parse here since words might be broken across the boundaries
+ 
+     if ( $header =~ /^Content-Transfer-Encoding$/i ) {
+         $encoding = $argument;
+         print "Setting encoding to $encoding\n" if $self->{debug};
+         my $compact_encoding = $encoding;
+         $compact_encoding =~ s/[^A-Za-z0-9]//g;
+         increment_word( $self, "encoding:$compact_encoding" );
+         return ($mime, $encoding);
+     }
+ 
+     # Some headers to discard
+ 
+     return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
+     
+     # Some headers should never be RFC 2047 decoded
+     
+     $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
+         
+     add_line( $self, $argument, 0, $prefix );
+ 
+     return ($mime, $encoding);
+ }
+ 
+ # ---------------------------------------------------------------------------------------------
+ #
  # clear_out_base64
  #
***************
*** 1149,1152 ****
--- 1189,1219 ----
  }

+ # ---------------------------------------------------------------------------------------------
+ #
+ # splitline - Escapes characters so a line will print as plain-text within a HTML document.
+ #
+ # $line         The line to escape
+ # $encoding     The value of any current encoding scheme
+ #
+ # ---------------------------------------------------------------------------------------------
+ 
+ sub splitline
+ {
+     my ($line, $encoding) = @_;
+     $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
+     $line =~ s/([^ \r\n]{120})/$1\r\n/g;        
+ 
+     $line =~ s/</&lt;/g;
+     $line =~ s/>/&gt;/g;
+ 
+     if ( $encoding =~ /quoted\-printable/i ) {
+         $line =~ s/=3C/&lt;/g;
+         $line =~ s/=3E/&gt;/g;
+     }
+ 
+     $line =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
+     
+     return $line;        
+ }

  1;