From: <ssc...@us...> - 2003-04-11 00:34:00
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv28186 Modified Files: Tag: v0/18/1 MailParse.pm Log Message: Fixes: 695565 Continued cc lines not colorized 702215 POPFile 0.18.1 fails do decode base64 attachments. 702316 0.18.1: nested QP parts - not decoded Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.91.2.2 retrieving revision 1.91.2.3 diff -C2 -d -r1.91.2.2 -r1.91.2.3 *** MailParse.pm 10 Apr 2003 22:11:23 -0000 1.91.2.2 --- MailParse.pm 11 Apr 2003 00:33:57 -0000 1.91.2.3 *************** *** 740,743 **** --- 740,748 ---- my $encoding = ''; + # Variables to save header information to while parsing headers + + my $header; + my $argument; + # Clear the word hash *************** *** 789,967 **** print ">>> $line" if $self->{debug}; ! if ( $self->{color} ) { ! my $splitline = $line; ! $splitline =~ s/([^\r\n]{100,120} )/$1\r\n/g; ! $splitline =~ s/([^ \r\n]{120})/$1\r\n/g; ! if ( !$self->{in_html_tag} ) { ! $colorized .= $self->{ut} if ( $self->{ut} ne '' ); ! $self->{ut} = ''; ! } ! ! #Escape some HTML characters to ensure display in HTML UI ! $splitline =~ s/</</g; ! $splitline =~ s/>/>/g; ! ! if ( $encoding =~ /quoted\-printable/i ) { ! $splitline =~ s/=3C/</g; ! $splitline =~ s/=3E/>/g; ! } ! $splitline =~ s/\t/ /g; ! $self->{ut} .= $splitline; ! } - if ($self->{in_headers}) { - # Check for blank line signifying end of headers - - if ( $line =~ /^(\r\n|\r|\n)/) { - $self->{in_headers} = 0; - print "Header parsing complete.\n" if $self->{debug}; - } - - # If we have an email header then just keep the part after the : - - if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { - my $header = $1; - my $argument = $2; - - print "Header ($header) ($argument)\n" if ($self->{debug}); - - # Handle the From, To and Cc headers and extract email addresses - # from them and treat them as words ! # For certain headers we are going to mark them specially in the corpus ! # by tagging them with where they were found to help the classifier ! # do a better job. So if you have ! # ! # From: fo...@ba... ! # ! # then we'll add from:fo...@ba... to the corpus and not just fo...@ba... ! my $prefix = ''; ! ! if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) { ! ! # Concatenate multi-line fields (To, CC) ! ! if ( ( $header =~ /^To/i ) || ( $header =~ /^Cc/i ) ) { ! my $currpos = tell MSG; ! my $tempread = <MSG>; ! while ( $tempread =~ s/^[ \t]+(.*?)[\r\n]+// ) { ! if ( $1 ne '' ) { ! $argument .= $1; ! $currpos = tell MSG; ! $tempread = <MSG>; ! } else { ! last; ! } ! } ! seek MSG, $currpos, 0; ! print "\n$header: [[$argument]]\n" if $self->{debug}; ! } ! if ( $argument =~ /=\?(.{1,40})\?/ ) { ! update_word( $self, $1, 0, '', '', 'charset' ); ! } ! ! if ( $header =~ /^From/i ) { ! $encoding = ''; ! $self->{content_type} = ''; ! $self->{from} = $self->decode_string( $argument ) if ( $self->{from} eq '' ) ; ! $prefix = 'from'; ! } ! $prefix = 'to' if ( $header =~ /^To/i ); ! $self->{to} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to} eq '' ) ); ! ! $prefix = 'cc' if ( $header =~ /^Cc/i ); ! $self->{cc} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc} eq '' ) ); ! ! while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { ! update_word($self, $1, 0, ';', '&',$prefix); ! add_url($self, $2, 0, '@', '[&<]',$prefix); ! } - while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) { - update_word($self, $1, 0, '', '',$prefix); - add_url($self, $2, 0, '@', '',$prefix); - } - - add_line( $self, $argument, 0, $prefix ); - next; - } - - $self->{subject} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject} eq '' ) ); ! if ( $header =~ /^Subject/i ) { ! $prefix = 'subject'; ! } ! $self->{date} = $argument if ( $header =~ /^Date/i ); ! # Look for MIME ! if ( $header =~ /^Content-Type/i ) { ! if ( $argument =~ /multipart\//i ) { ! my $boundary = $argument; ! ! if ( !( $argument =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i )) { ! $boundary = <MSG>; ! } ! ! if ( $boundary =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { ! ! $boundary = ($2 || $3); ! ! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g; ! ! if ($mime ne '') { ! ! # Fortunately the pipe character isn't a valid mime boundary character! ! ! $mime = join('|', $mime, $boundary); ! } else { ! $mime = $boundary; ! } ! print "Set mime boundary to " . $mime . "\n" if $self->{debug}; ! next; ! } ! } ! if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { ! update_word( $self, $1, 0, '' , '', 'charset' ); ! } ! ! if ( $argument =~ /^(.*?)(;$)/ ) { ! print "Set content type to $1\n" if $self->{debug}; ! $self->{content_type} = $1; ! } ! next; ! } ! ! # Look for the different encodings in a MIME document, when we hit base64 we will ! # do a special parse here since words might be broken across the boundaries ! ! if ( $header =~ /^Content-Transfer-Encoding/i ) { ! $encoding = $argument; ! print "Setting encoding to $encoding\n" if $self->{debug}; ! my $compact_encoding = $encoding; ! $compact_encoding =~ s/[^A-Za-z0-9]//g; ! increment_word( $self, "encoding:$compact_encoding" ); ! next; ! } ! ! # Some headers to discard ! ! next if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i ); ! ! add_line( $self, $argument, 0, $prefix ); ! next; } } --- 794,854 ---- print ">>> $line" if $self->{debug}; ! if ($self->{color}) { ! if (!$self->{in_html_tag}) { ! $colorized .= $self->{ut}; $self->{ut} = ''; ! } ! ! $self->{ut} .= splitline($line, $encoding); ! } ! if ($self->{in_headers}) { + # temporary colorization while in headers is handled within parse_header ! $self->{ut} = ''; # Check for blank line signifying end of headers ! if ( $line =~ /^(\r\n|\r|\n)/) { ! ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! # Clear the saved headers ! $header = ''; ! $argument = ''; ! ! $self->{ut} .= splitline("\015\012", 0); ! $self->{in_headers} = 0; ! print "Header parsing complete.\n" if $self->{debug}; ! next; ! } ! # If we have an email header then just keep the part after the : ! if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding) if ($header ne ''); ! # Save the new information for the current header ! ! $header = $1; ! $argument = $2; next; } + + # Append to argument if the next line begins with whitespace (isn't a new header) + + if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) { + $argument .= "\015\012" . $1; + } + next; } *************** *** 1074,1077 **** --- 961,1117 ---- # --------------------------------------------------------------------------------------------- # + # parse_header - Performs parsing operations on a message header + # + # $header Name of header being processed + # $argument Value of header being processed + # $mime The presently saved mime boundaries list + # $encoding Current message encoding + # + # --------------------------------------------------------------------------------------------- + sub parse_header + { + my ($self, $header, $argument, $mime, $encoding) = @_; + + print "Header ($header) ($argument)\n" if ($self->{debug}); + + # Remove over-reading + $self->{ut} = ''; + + # Qeueue just this header for colorization + $self->{ut} = splitline("$header: $argument\015\012", $encoding); + + # Check the encoding type in all RFC 2047 encoded headers + + if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) { + update_word( $self, $1, 0, '', '', 'charset' ); + } + + # Handle the From, To and Cc headers and extract email addresses + # from them and treat them as words + + + # For certain headers we are going to mark them specially in the corpus + # by tagging them with where they were found to help the classifier + # do a better job. So if you have + # + # From: fo...@ba... + # + # then we'll add from:fo...@ba... to the corpus and not just fo...@ba... + + my $prefix = ''; + + if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) { + + # These headers at least can be decoded + + $argument = $self->decode_string( $argument ); + + if ( $argument =~ /=\?(.{1,40})\?/ ) { + update_word( $self, $1, 0, '', '', 'charset' ); + } + + if ( $header =~ /^From$/i ) { + $encoding = ''; + $self->{content_type} = ''; + $self->{from} = $argument if ( $self->{from} eq '' ) ; + $prefix = 'from'; + } + + if ( $header =~ /^To$/i ) { + $prefix = 'to'; + $self->{to} = $argument if ( $self->{to} eq '' ); + } + + if ( $header =~ /^Cc$/i ) { + $prefix = 'cc'; + $self->{cc} = $argument if ( $self->{cc} eq '' ); + } + + while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { + update_word($self, $1, 0, ';', '&',$prefix); + add_url($self, $2, 0, '@', '[&<]',$prefix); + } + + while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) { + update_word($self, $1, 0, '', '',$prefix); + add_url($self, $2, 0, '@', '',$prefix); + } + + add_line( $self, $argument, 0, $prefix ); + return ($mime, $encoding); + } + + if ( $header =~ /^Subject$/i ) { + $prefix = 'subject'; + $argument = $self->decode_string( $argument ); + $self->{subject} = $argument if ( ( $self->{subject} eq '' ) ); + } + + $self->{date} = $argument if ( $header =~ /^Date/i ); + + # Look for MIME + + if ( $header =~ /^Content-Type$/i ) { + + if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { + update_word( $self, $1, 0, '' , '', 'charset' ); + } + + if ( $argument =~ /^(.*?)(;)/ ) { + print "Set content type to $1\n" if $self->{debug}; + $self->{content_type} = $1; + } + + if ( $argument =~ /multipart\//i ) { + my $boundary = $argument; + + if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { + + $boundary = ($2 || $3); + + $boundary =~ s/(.*)/\Q$1\E/g; + + if ($mime ne '') { + + # Fortunately the pipe character isn't a valid mime boundary character! + + $mime = join('|', $mime, $boundary); + } else { + $mime = $boundary; + } + print "Set mime boundary to " . $mime . "\n" if $self->{debug}; + return ($mime, $encoding); + } + } + return ($mime, $encoding); + } + + # Look for the different encodings in a MIME document, when we hit base64 we will + # do a special parse here since words might be broken across the boundaries + + if ( $header =~ /^Content-Transfer-Encoding$/i ) { + $encoding = $argument; + print "Setting encoding to $encoding\n" if $self->{debug}; + my $compact_encoding = $encoding; + $compact_encoding =~ s/[^A-Za-z0-9]//g; + increment_word( $self, "encoding:$compact_encoding" ); + return ($mime, $encoding); + } + + # Some headers to discard + + return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i ); + + # Some headers should never be RFC 2047 decoded + + $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i); + + add_line( $self, $argument, 0, $prefix ); + + return ($mime, $encoding); + } + + # --------------------------------------------------------------------------------------------- + # # clear_out_base64 # *************** *** 1149,1152 **** --- 1189,1219 ---- } + # --------------------------------------------------------------------------------------------- + # + # splitline - Escapes characters so a line will print as plain-text within a HTML document. + # + # $line The line to escape + # $encoding The value of any current encoding scheme + # + # --------------------------------------------------------------------------------------------- + + sub splitline + { + my ($line, $encoding) = @_; + $line =~ s/([^\r\n]{100,120} )/$1\r\n/g; + $line =~ s/([^ \r\n]{120})/$1\r\n/g; + + $line =~ s/</</g; + $line =~ s/>/>/g; + + if ( $encoding =~ /quoted\-printable/i ) { + $line =~ s/=3C/</g; + $line =~ s/=3E/>/g; + } + + $line =~ s/\t/ /g; + + return $line; + } 1; |