|
From: <ssc...@us...> - 2003-04-11 00:34:00
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv28186
Modified Files:
Tag: v0/18/1
MailParse.pm
Log Message:
Fixes:
695565 Continued cc lines not colorized
702215 POPFile 0.18.1 fails do decode base64 attachments.
702316 0.18.1: nested QP parts - not decoded
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.91.2.2
retrieving revision 1.91.2.3
diff -C2 -d -r1.91.2.2 -r1.91.2.3
*** MailParse.pm 10 Apr 2003 22:11:23 -0000 1.91.2.2
--- MailParse.pm 11 Apr 2003 00:33:57 -0000 1.91.2.3
***************
*** 740,743 ****
--- 740,748 ----
my $encoding = '';
+ # Variables to save header information to while parsing headers
+
+ my $header;
+ my $argument;
+
# Clear the word hash
***************
*** 789,967 ****
print ">>> $line" if $self->{debug};
! if ( $self->{color} ) {
! my $splitline = $line;
! $splitline =~ s/([^\r\n]{100,120} )/$1\r\n/g;
! $splitline =~ s/([^ \r\n]{120})/$1\r\n/g;
! if ( !$self->{in_html_tag} ) {
! $colorized .= $self->{ut} if ( $self->{ut} ne '' );
!
$self->{ut} = '';
! }
!
! #Escape some HTML characters to ensure display in HTML UI
! $splitline =~ s/</</g;
! $splitline =~ s/>/>/g;
!
! if ( $encoding =~ /quoted\-printable/i ) {
! $splitline =~ s/=3C/</g;
! $splitline =~ s/=3E/>/g;
! }
! $splitline =~ s/\t/ /g;
! $self->{ut} .= $splitline;
! }
- if ($self->{in_headers}) {
-
# Check for blank line signifying end of headers
-
- if ( $line =~ /^(\r\n|\r|\n)/) {
- $self->{in_headers} = 0;
- print "Header parsing complete.\n" if $self->{debug};
- }
-
- # If we have an email header then just keep the part after the :
-
- if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
- my $header = $1;
- my $argument = $2;
-
- print "Header ($header) ($argument)\n" if ($self->{debug});
-
- # Handle the From, To and Cc headers and extract email addresses
- # from them and treat them as words
! # For certain headers we are going to mark them specially in the corpus
! # by tagging them with where they were found to help the classifier
! # do a better job. So if you have
! #
! # From: fo...@ba...
! #
! # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...
! my $prefix = '';
!
! if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) {
!
! # Concatenate multi-line fields (To, CC)
!
! if ( ( $header =~ /^To/i ) || ( $header =~ /^Cc/i ) ) {
! my $currpos = tell MSG;
! my $tempread = <MSG>;
! while ( $tempread =~ s/^[ \t]+(.*?)[\r\n]+// ) {
! if ( $1 ne '' ) {
! $argument .= $1;
! $currpos = tell MSG;
! $tempread = <MSG>;
! } else {
! last;
! }
! }
! seek MSG, $currpos, 0;
! print "\n$header: [[$argument]]\n" if $self->{debug};
! }
! if ( $argument =~ /=\?(.{1,40})\?/ ) {
! update_word( $self, $1, 0, '', '', 'charset' );
! }
!
! if ( $header =~ /^From/i ) {
! $encoding = '';
! $self->{content_type} = '';
! $self->{from} = $self->decode_string( $argument ) if ( $self->{from} eq '' ) ;
! $prefix = 'from';
! }
! $prefix = 'to' if ( $header =~ /^To/i );
! $self->{to} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to} eq '' ) );
!
! $prefix = 'cc' if ( $header =~ /^Cc/i );
! $self->{cc} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc} eq '' ) );
!
! while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) {
! update_word($self, $1, 0, ';', '&',$prefix);
! add_url($self, $2, 0, '@', '[&<]',$prefix);
! }
- while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) {
- update_word($self, $1, 0, '', '',$prefix);
- add_url($self, $2, 0, '@', '',$prefix);
- }
-
- add_line( $self, $argument, 0, $prefix );
- next;
- }
-
- $self->{subject} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject} eq '' ) );
! if ( $header =~ /^Subject/i ) {
! $prefix = 'subject';
! }
! $self->{date} = $argument if ( $header =~ /^Date/i );
! # Look for MIME
! if ( $header =~ /^Content-Type/i ) {
! if ( $argument =~ /multipart\//i ) {
! my $boundary = $argument;
!
! if ( !( $argument =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i )) {
! $boundary = <MSG>;
! }
!
! if ( $boundary =~ /boundary=(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
!
! $boundary = ($2 || $3);
!
! $boundary =~ s/(\+|\/|\?|\*|\||\(|\)|\[|\]|\{|\}|\^|\$|\.)/\\$1/g;
!
! if ($mime ne '') {
!
! # Fortunately the pipe character isn't a valid mime boundary character!
!
! $mime = join('|', $mime, $boundary);
! } else {
! $mime = $boundary;
! }
! print "Set mime boundary to " . $mime . "\n" if $self->{debug};
! next;
! }
! }
! if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
! update_word( $self, $1, 0, '' , '', 'charset' );
! }
!
! if ( $argument =~ /^(.*?)(;$)/ ) {
! print "Set content type to $1\n" if $self->{debug};
! $self->{content_type} = $1;
! }
! next;
! }
!
! # Look for the different encodings in a MIME document, when we hit base64 we will
! # do a special parse here since words might be broken across the boundaries
!
! if ( $header =~ /^Content-Transfer-Encoding/i ) {
! $encoding = $argument;
! print "Setting encoding to $encoding\n" if $self->{debug};
! my $compact_encoding = $encoding;
! $compact_encoding =~ s/[^A-Za-z0-9]//g;
! increment_word( $self, "encoding:$compact_encoding" );
! next;
! }
!
! # Some headers to discard
!
! next if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i );
!
! add_line( $self, $argument, 0, $prefix );
!
next;
}
}
--- 794,854 ----
print ">>> $line" if $self->{debug};
! if ($self->{color}) {
! if (!$self->{in_html_tag}) {
! $colorized .= $self->{ut};
$self->{ut} = '';
! }
!
! $self->{ut} .= splitline($line, $encoding);
! }
! if ($self->{in_headers}) {
+ # temporary colorization while in headers is handled within parse_header
! $self->{ut} = '';
# Check for blank line signifying end of headers
! if ( $line =~ /^(\r\n|\r|\n)/) {
!
! # Parse the last header
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
! # Clear the saved headers
! $header = '';
! $argument = '';
!
! $self->{ut} .= splitline("\015\012", 0);
! $self->{in_headers} = 0;
! print "Header parsing complete.\n" if $self->{debug};
! next;
! }
! # If we have an email header then just keep the part after the :
! if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) {
! # Parse the last header
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding) if ($header ne '');
! # Save the new information for the current header
!
! $header = $1;
! $argument = $2;
next;
}
+
+ # Append to argument if the next line begins with whitespace (isn't a new header)
+
+ if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) {
+ $argument .= "\015\012" . $1;
+ }
+ next;
}
***************
*** 1074,1077 ****
--- 961,1117 ----
# ---------------------------------------------------------------------------------------------
#
+ # parse_header - Performs parsing operations on a message header
+ #
+ # $header Name of header being processed
+ # $argument Value of header being processed
+ # $mime The presently saved mime boundaries list
+ # $encoding Current message encoding
+ #
+ # ---------------------------------------------------------------------------------------------
+ sub parse_header
+ {
+ my ($self, $header, $argument, $mime, $encoding) = @_;
+
+ print "Header ($header) ($argument)\n" if ($self->{debug});
+
+ # Remove over-reading
+ $self->{ut} = '';
+
+ # Qeueue just this header for colorization
+ $self->{ut} = splitline("$header: $argument\015\012", $encoding);
+
+ # Check the encoding type in all RFC 2047 encoded headers
+
+ if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
+ update_word( $self, $1, 0, '', '', 'charset' );
+ }
+
+ # Handle the From, To and Cc headers and extract email addresses
+ # from them and treat them as words
+
+
+ # For certain headers we are going to mark them specially in the corpus
+ # by tagging them with where they were found to help the classifier
+ # do a better job. So if you have
+ #
+ # From: fo...@ba...
+ #
+ # then we'll add from:fo...@ba... to the corpus and not just fo...@ba...
+
+ my $prefix = '';
+
+ if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) {
+
+ # These headers at least can be decoded
+
+ $argument = $self->decode_string( $argument );
+
+ if ( $argument =~ /=\?(.{1,40})\?/ ) {
+ update_word( $self, $1, 0, '', '', 'charset' );
+ }
+
+ if ( $header =~ /^From$/i ) {
+ $encoding = '';
+ $self->{content_type} = '';
+ $self->{from} = $argument if ( $self->{from} eq '' ) ;
+ $prefix = 'from';
+ }
+
+ if ( $header =~ /^To$/i ) {
+ $prefix = 'to';
+ $self->{to} = $argument if ( $self->{to} eq '' );
+ }
+
+ if ( $header =~ /^Cc$/i ) {
+ $prefix = 'cc';
+ $self->{cc} = $argument if ( $self->{cc} eq '' );
+ }
+
+ while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) {
+ update_word($self, $1, 0, ';', '&',$prefix);
+ add_url($self, $2, 0, '@', '[&<]',$prefix);
+ }
+
+ while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) {
+ update_word($self, $1, 0, '', '',$prefix);
+ add_url($self, $2, 0, '@', '',$prefix);
+ }
+
+ add_line( $self, $argument, 0, $prefix );
+ return ($mime, $encoding);
+ }
+
+ if ( $header =~ /^Subject$/i ) {
+ $prefix = 'subject';
+ $argument = $self->decode_string( $argument );
+ $self->{subject} = $argument if ( ( $self->{subject} eq '' ) );
+ }
+
+ $self->{date} = $argument if ( $header =~ /^Date/i );
+
+ # Look for MIME
+
+ if ( $header =~ /^Content-Type$/i ) {
+
+ if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
+ update_word( $self, $1, 0, '' , '', 'charset' );
+ }
+
+ if ( $argument =~ /^(.*?)(;)/ ) {
+ print "Set content type to $1\n" if $self->{debug};
+ $self->{content_type} = $1;
+ }
+
+ if ( $argument =~ /multipart\//i ) {
+ my $boundary = $argument;
+
+ if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
+
+ $boundary = ($2 || $3);
+
+ $boundary =~ s/(.*)/\Q$1\E/g;
+
+ if ($mime ne '') {
+
+ # Fortunately the pipe character isn't a valid mime boundary character!
+
+ $mime = join('|', $mime, $boundary);
+ } else {
+ $mime = $boundary;
+ }
+ print "Set mime boundary to " . $mime . "\n" if $self->{debug};
+ return ($mime, $encoding);
+ }
+ }
+ return ($mime, $encoding);
+ }
+
+ # Look for the different encodings in a MIME document, when we hit base64 we will
+ # do a special parse here since words might be broken across the boundaries
+
+ if ( $header =~ /^Content-Transfer-Encoding$/i ) {
+ $encoding = $argument;
+ print "Setting encoding to $encoding\n" if $self->{debug};
+ my $compact_encoding = $encoding;
+ $compact_encoding =~ s/[^A-Za-z0-9]//g;
+ increment_word( $self, "encoding:$compact_encoding" );
+ return ($mime, $encoding);
+ }
+
+ # Some headers to discard
+
+ return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
+
+ # Some headers should never be RFC 2047 decoded
+
+ $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i);
+
+ add_line( $self, $argument, 0, $prefix );
+
+ return ($mime, $encoding);
+ }
+
+ # ---------------------------------------------------------------------------------------------
+ #
# clear_out_base64
#
***************
*** 1149,1152 ****
--- 1189,1219 ----
}
+ # ---------------------------------------------------------------------------------------------
+ #
+ # splitline - Escapes characters so a line will print as plain-text within a HTML document.
+ #
+ # $line The line to escape
+ # $encoding The value of any current encoding scheme
+ #
+ # ---------------------------------------------------------------------------------------------
+
+ sub splitline
+ {
+ my ($line, $encoding) = @_;
+ $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
+ $line =~ s/([^ \r\n]{120})/$1\r\n/g;
+
+ $line =~ s/</</g;
+ $line =~ s/>/>/g;
+
+ if ( $encoding =~ /quoted\-printable/i ) {
+ $line =~ s/=3C/</g;
+ $line =~ s/=3E/>/g;
+ }
+
+ $line =~ s/\t/ /g;
+
+ return $line;
+ }
1;
|