|
From: <ssc...@us...> - 2003-03-29 00:17:14
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv9703
Modified Files:
MailParse.pm
Log Message:
generalizes multi-line header parser handling (experimental), includes ideas in patch 694857
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.100
retrieving revision 1.101
diff -C2 -d -r1.100 -r1.101
*** MailParse.pm 29 Mar 2003 00:11:50 -0000 1.100
--- MailParse.pm 29 Mar 2003 00:17:08 -0000 1.101
***************
*** 742,745 ****
--- 742,750 ----
my $encoding = '';
+
+ # Variables to save header information to while parsing headers
+
+ my $header;
+ my $argument;
# Clear the word hash
***************
*** 815,819 ****
--- 820,828 ----
}
+ # Parse the last header
+
+ ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
$splitline =~ s/\t/ /g;
+
$self->{ut__} .= $splitline;
}
***************
*** 826,971 ****
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
! }
# If we have an email header then just keep the part after the :
if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
! my $header = $1;
! my $argument = $2;
!
! print "Header ($header) ($argument)\n" if ($self->{debug});
!
! # Handle the From, To and Cc headers and extract email addresses
! # from them and treat them as words
!
! # For certain headers we are going to mark them specially in the corpus
! # by tagging them with where they were found to help the classifier
! # do a better job. So if you have
! #
! # From: fo...@ba...
! #
! # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...
!
! my $prefix = '';
!
! if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) {
!
! # Concatenate multi-line fields (To, CC)
!
! if ( ( $header =~ /^To/i ) || ( $header =~ /^Cc/i ) ) {
! my $currpos = tell MSG;
! my $tempread = <MSG>;
! while ( $tempread =~ s/^[ \t]+(.*?)[\r\n]+// ) {
! if ( $1 ne '' ) {
! $argument .= $1;
! $currpos = tell MSG;
! $tempread = <MSG>;
! } else {
! last;
! }
! }
! seek MSG, $currpos, 0;
! print "\n$header: [[$argument]]\n" if $self->{debug};
! }
!
! if ( $argument =~ /=\?(.{1,40})\?/ ) {
! update_word( $self, $1, 0, '', '', 'charset' );
! }
!
! if ( $header =~ /^From/i ) {
! $encoding = '';
! $self->{content_type__} = '';
! $self->{from__} = $self->decode_string( $argument ) if ( $self->{from__} eq '' ) ;
! $prefix = 'from';
! }
!
! $prefix = 'to' if ( $header =~ /^To/i );
! $self->{to__} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to__} eq '' ) );
!
! $prefix = 'cc' if ( $header =~ /^Cc/i );
! $self->{cc__} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc__} eq '' ) );
!
! while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) {
! update_word($self, $1, 0, ';', '&',$prefix);
! add_url($self, $2, 0, '@', '[&<]',$prefix);
! }
!
! while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) {
! update_word($self, $1, 0, '', '',$prefix);
! add_url($self, $2, 0, '@', '',$prefix);
! }
!
! add_line( $self, $argument, 0, $prefix );
! next;
! }
!
! $self->{subject__} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject__} eq '' ) );
!
! if ( $header =~ /^Subject/i ) {
! $prefix = 'subject';
! }
!
! $self->{date__} = $argument if ( $header =~ /^Date/i );
!
! # Look for MIME
!
! if ( $header =~ /^Content-Type/i ) {
! if ( $argument =~ /multipart\//i ) {
! my $boundary = $argument;
!
! if ( !( $argument =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i )) {
! $boundary = <MSG>;
! }
!
! if ( $boundary =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
!
! $boundary = ($2 || $3);
!
! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g;
!
! if ($mime ne '') {
!
! # Fortunately the pipe character isn't a valid mime boundary character!
!
! $mime = join('|', $mime, $boundary);
! } else {
! $mime = $boundary;
! }
! print "Set mime boundary to " . $mime . "\n" if $self->{debug};
! next;
! }
! }
!
! if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
! update_word( $self, $1, 0, '' , '', 'charset' );
! }
!
! if ( $argument =~ /^(.*?)(;$)/ ) {
! print "Set content type to $1\n" if $self->{debug};
! $self->{content_type__} = $1;
! }
! next;
! }
!
! # Look for the different encodings in a MIME document, when we hit base64 we will
! # do a special parse here since words might be broken across the boundaries
!
! if ( $header =~ /^Content-Transfer-Encoding/i ) {
! $encoding = $argument;
! print "Setting encoding to $encoding\n" if $self->{debug};
! my $compact_encoding = $encoding;
! $compact_encoding =~ s/[^A-Za-z0-9]//g;
! increment_word( $self, "encoding:$compact_encoding" );
! next;
! }
!
! # Some headers to discard
!
! next if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i );
!
! add_line( $self, $argument, 0, $prefix );
!
! next;
}
}
--- 835,860 ----
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
! }
# If we have an email header then just keep the part after the :
if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
!
! # Parse the last header
!
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
!
! # Save the new information for the current header
!
! $header = $1;
! $argument = $2;
}
+
+ # Append to argument if the next line begins with whitespace (isn't a new header)
+
+ if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {
+ $argument .= $1;
+ }
+ next;
}
***************
*** 991,995 ****
# escape to match escaped boundary characters
! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g;
my $temp_mime;
--- 880,884 ----
# escape to match escaped boundary characters
! $boundary =~ s/(.*)/\Q$1\E/g;
my $temp_mime;
***************
*** 1166,1169 ****
--- 1055,1188 ----
return $self->{$header . '__'};
}
+
+
+ # ---------------------------------------------------------------------------------------------
+ #
+ # parse_header - Performs parsing operations on a message header
+ #
+ # $header Name of header being processed
+ # $argument Value of header being processed
+ # $mime The presently saved mime boundaries list
+ # $encoding Current message encoding
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub parse_header
+ {
+ my ($self, $header, $argument, $mime, $encoding ) = @_;
+
+ print "Header ($header) ($argument)\n" if ($self->{debug});
+
+ # Handle the From, To and Cc headers and extract email addresses
+ # from them and treat them as words
+
+ # For certain headers we are going to mark them specially in the corpus
+ # by tagging them with where they were found to help the classifier
+ # do a better job. So if you have
+ #
+ # From: fo...@ba...
+ #
+ # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...
+
+ my $prefix = '';
+
+ if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) {
+
+ if ( $argument =~ /=\?(.{1,40})\?/ ) {
+ update_word( $self, $1, 0, '', '', 'charset' );
+ }
+
+ if ( $header =~ /^From/i ) {
+ $encoding = '';
+ $self->{content_type__} = '';
+ $self->{from__} = $self->decode_string( $argument ) if ( $self->{from__} eq '' ) ;
+ $prefix = 'from';
+ }
+
+ $prefix = 'to' if ( $header =~ /^To/i );
+ $self->{to__} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to__} eq '' ) );
+
+ $prefix = 'cc' if ( $header =~ /^Cc/i );
+ $self->{cc__} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc__} eq '' ) );
+
+ while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) {
+ update_word($self, $1, 0, ';', '&',$prefix);
+ add_url($self, $2, 0, '@', '[&<]',$prefix);
+ }
+
+ while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) {
+ update_word($self, $1, 0, '', '',$prefix);
+ add_url($self, $2, 0, '@', '',$prefix);
+ }
+
+ add_line( $self, $argument, 0, $prefix );
+ return ($mime, $encoding);
+ }
+
+ $self->{subject__} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject__} eq '' ) );
+
+ if ( $header =~ /^Subject/i ) {
+ $prefix = 'subject';
+ }
+
+ $self->{date__} = $argument if ( $header =~ /^Date/i );
+
+ # Look for MIME
+
+ if ( $header =~ /^Content-Type/i ) {
+
+ if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
+ update_word( $self, $1, 0, '' , '', 'charset' );
+ }
+
+ if ( $argument =~ /^(.*?)(;)/ ) {
+ print "Set content type to $1\n" if $self->{debug};
+ $self->{content_type__} = $1;
+ }
+
+ if ( $argument =~ /multipart\//i ) {
+ my $boundary = $argument;
+
+ if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
+
+ $boundary = ($2 || $3);
+
+ $boundary =~ s/(.*)/\Q$1\E/g;
+
+ if ($mime ne '') {
+
+ # Fortunately the pipe character isn't a valid mime boundary character!
+
+ $mime = join('|', $mime, $boundary);
+ } else {
+ $mime = $boundary;
+ }
+ print "Set mime boundary to " . $mime . "\n" if $self->{debug};
+ return ($mime, $encoding);
+ }
+ }
+ return ($mime, $encoding);
+ }
+
+ # Look for the different encodings in a MIME document, when we hit base64 we will
+ # do a special parse here since words might be broken across the boundaries
+
+ if ( $header =~ /^Content-Transfer-Encoding/i ) {
+ $encoding = $argument;
+ print "Setting encoding to $encoding\n" if $self->{debug};
+ my $compact_encoding = $encoding;
+ $compact_encoding =~ s/[^A-Za-z0-9]//g;
+ increment_word( $self, "encoding:$compact_encoding" );
+ return ($mime, $encoding);
+ }
+
+ # Some headers to discard
+
+ return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i );
+
+ add_line( $self, $argument, 0, $prefix );
+
+ return ($mime, $encoding);
+ }
+
# GETTERS/SETTERS
|