From: <ssc...@us...> - 2003-03-29 00:17:14
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv9703 Modified Files: MailParse.pm Log Message: generalizes multi-line header parser handling (experimental), includes ideas in patch 694857 Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.100 retrieving revision 1.101 diff -C2 -d -r1.100 -r1.101 *** MailParse.pm 29 Mar 2003 00:11:50 -0000 1.100 --- MailParse.pm 29 Mar 2003 00:17:08 -0000 1.101 *************** *** 742,745 **** --- 742,750 ---- my $encoding = ''; + + # Variables to save header information to while parsing headers + + my $header; + my $argument; # Clear the word hash *************** *** 815,819 **** --- 820,828 ---- } + # Parse the last header + + ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); $splitline =~ s/\t/ /g; + $self->{ut__} .= $splitline; } *************** *** 826,971 **** $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; ! } # If we have an email header then just keep the part after the : if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { ! my $header = $1; ! my $argument = $2; ! ! print "Header ($header) ($argument)\n" if ($self->{debug}); ! ! # Handle the From, To and Cc headers and extract email addresses ! # from them and treat them as words ! ! # For certain headers we are going to mark them specially in the corpus ! # by tagging them with where they were found to help the classifier ! # do a better job. So if you have ! # ! # From: fo...@ba... ! # ! # then we'll add from:fo...@ba... to the corpus and not just fo...@ba... ! ! my $prefix = ''; ! ! if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) { ! ! # Concatenate multi-line fields (To, CC) ! ! if ( ( $header =~ /^To/i ) || ( $header =~ /^Cc/i ) ) { ! my $currpos = tell MSG; ! my $tempread = <MSG>; ! while ( $tempread =~ s/^[ \t]+(.*?)[\r\n]+// ) { ! if ( $1 ne '' ) { ! $argument .= $1; ! $currpos = tell MSG; ! $tempread = <MSG>; ! } else { ! last; ! } ! } ! seek MSG, $currpos, 0; ! print "\n$header: [[$argument]]\n" if $self->{debug}; ! } ! ! if ( $argument =~ /=\?(.{1,40})\?/ ) { ! update_word( $self, $1, 0, '', '', 'charset' ); ! } ! ! if ( $header =~ /^From/i ) { ! $encoding = ''; ! $self->{content_type__} = ''; ! $self->{from__} = $self->decode_string( $argument ) if ( $self->{from__} eq '' ) ; ! $prefix = 'from'; ! } ! ! $prefix = 'to' if ( $header =~ /^To/i ); ! $self->{to__} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to__} eq '' ) ); ! ! $prefix = 'cc' if ( $header =~ /^Cc/i ); ! $self->{cc__} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc__} eq '' ) ); ! ! while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { ! update_word($self, $1, 0, ';', '&',$prefix); ! add_url($self, $2, 0, '@', '[&<]',$prefix); ! } ! ! while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) { ! update_word($self, $1, 0, '', '',$prefix); ! add_url($self, $2, 0, '@', '',$prefix); ! } ! ! add_line( $self, $argument, 0, $prefix ); ! next; ! } ! ! $self->{subject__} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject__} eq '' ) ); ! ! if ( $header =~ /^Subject/i ) { ! $prefix = 'subject'; ! } ! ! $self->{date__} = $argument if ( $header =~ /^Date/i ); ! ! # Look for MIME ! ! if ( $header =~ /^Content-Type/i ) { ! if ( $argument =~ /multipart\//i ) { ! my $boundary = $argument; ! ! if ( !( $argument =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i )) { ! $boundary = <MSG>; ! } ! ! if ( $boundary =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { ! ! $boundary = ($2 || $3); ! ! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g; ! ! if ($mime ne '') { ! ! # Fortunately the pipe character isn't a valid mime boundary character! ! ! $mime = join('|', $mime, $boundary); ! } else { ! $mime = $boundary; ! } ! print "Set mime boundary to " . $mime . "\n" if $self->{debug}; ! next; ! } ! } ! ! if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { ! update_word( $self, $1, 0, '' , '', 'charset' ); ! } ! ! if ( $argument =~ /^(.*?)(;$)/ ) { ! print "Set content type to $1\n" if $self->{debug}; ! $self->{content_type__} = $1; ! } ! next; ! } ! ! # Look for the different encodings in a MIME document, when we hit base64 we will ! # do a special parse here since words might be broken across the boundaries ! ! if ( $header =~ /^Content-Transfer-Encoding/i ) { ! $encoding = $argument; ! print "Setting encoding to $encoding\n" if $self->{debug}; ! my $compact_encoding = $encoding; ! $compact_encoding =~ s/[^A-Za-z0-9]//g; ! increment_word( $self, "encoding:$compact_encoding" ); ! next; ! } ! ! # Some headers to discard ! ! next if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i ); ! ! add_line( $self, $argument, 0, $prefix ); ! ! next; } } --- 835,860 ---- $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; ! } # If we have an email header then just keep the part after the : if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { ! ! # Parse the last header ! ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); ! ! # Save the new information for the current header ! ! $header = $1; ! $argument = $2; } + + # Append to argument if the next line begins with whitespace (isn't a new header) + + if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) { + $argument .= $1; + } + next; } *************** *** 991,995 **** # escape to match escaped boundary characters ! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g; my $temp_mime; --- 880,884 ---- # escape to match escaped boundary characters ! $boundary =~ s/(.*)/\Q$1\E/g; my $temp_mime; *************** *** 1166,1169 **** --- 1055,1188 ---- return $self->{$header . '__'}; } + + + # --------------------------------------------------------------------------------------------- + # + # parse_header - Performs parsing operations on a message header + # + # $header Name of header being processed + # $argument Value of header being processed + # $mime The presently saved mime boundaries list + # $encoding Current message encoding + # + # --------------------------------------------------------------------------------------------- + sub parse_header + { + my ($self, $header, $argument, $mime, $encoding ) = @_; + + print "Header ($header) ($argument)\n" if ($self->{debug}); + + # Handle the From, To and Cc headers and extract email addresses + # from them and treat them as words + + # For certain headers we are going to mark them specially in the corpus + # by tagging them with where they were found to help the classifier + # do a better job. So if you have + # + # From: fo...@ba... + # + # then we'll add from:fo...@ba... to the corpus and not just fo...@ba... + + my $prefix = ''; + + if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) { + + if ( $argument =~ /=\?(.{1,40})\?/ ) { + update_word( $self, $1, 0, '', '', 'charset' ); + } + + if ( $header =~ /^From/i ) { + $encoding = ''; + $self->{content_type__} = ''; + $self->{from__} = $self->decode_string( $argument ) if ( $self->{from__} eq '' ) ; + $prefix = 'from'; + } + + $prefix = 'to' if ( $header =~ /^To/i ); + $self->{to__} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to__} eq '' ) ); + + $prefix = 'cc' if ( $header =~ /^Cc/i ); + $self->{cc__} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc__} eq '' ) ); + + while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { + update_word($self, $1, 0, ';', '&',$prefix); + add_url($self, $2, 0, '@', '[&<]',$prefix); + } + + while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) { + update_word($self, $1, 0, '', '',$prefix); + add_url($self, $2, 0, '@', '',$prefix); + } + + add_line( $self, $argument, 0, $prefix ); + return ($mime, $encoding); + } + + $self->{subject__} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject__} eq '' ) ); + + if ( $header =~ /^Subject/i ) { + $prefix = 'subject'; + } + + $self->{date__} = $argument if ( $header =~ /^Date/i ); + + # Look for MIME + + if ( $header =~ /^Content-Type/i ) { + + if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { + update_word( $self, $1, 0, '' , '', 'charset' ); + } + + if ( $argument =~ /^(.*?)(;)/ ) { + print "Set content type to $1\n" if $self->{debug}; + $self->{content_type__} = $1; + } + + if ( $argument =~ /multipart\//i ) { + my $boundary = $argument; + + if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { + + $boundary = ($2 || $3); + + $boundary =~ s/(.*)/\Q$1\E/g; + + if ($mime ne '') { + + # Fortunately the pipe character isn't a valid mime boundary character! + + $mime = join('|', $mime, $boundary); + } else { + $mime = $boundary; + } + print "Set mime boundary to " . $mime . "\n" if $self->{debug}; + return ($mime, $encoding); + } + } + return ($mime, $encoding); + } + + # Look for the different encodings in a MIME document, when we hit base64 we will + # do a special parse here since words might be broken across the boundaries + + if ( $header =~ /^Content-Transfer-Encoding/i ) { + $encoding = $argument; + print "Setting encoding to $encoding\n" if $self->{debug}; + my $compact_encoding = $encoding; + $compact_encoding =~ s/[^A-Za-z0-9]//g; + increment_word( $self, "encoding:$compact_encoding" ); + return ($mime, $encoding); + } + + # Some headers to discard + + return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i ); + + add_line( $self, $argument, 0, $prefix ); + + return ($mime, $encoding); + } + # GETTERS/SETTERS |