From: <ssc...@us...> - 2003-04-08 03:16:47
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv14429 Modified Files: MailParse.pm Log Message: Fixes [ 696986 ] 0.18.1: Underscores in QP encoded headers, misc other header optimizations Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.105 retrieving revision 1.106 diff -C2 -d -r1.105 -r1.106 *** MailParse.pm 7 Apr 2003 20:36:52 -0000 1.105 --- MailParse.pm 8 Apr 2003 03:16:44 -0000 1.106 *************** *** 15,18 **** --- 15,19 ---- use MIME::Base64; use MIME::QuotedPrint; + require Encode::MIME::Header; # HTML entity mapping to character codes, this maps things like & to their corresponding *************** *** 1053,1068 **** my ( $self, $mystring ) = @_; ! my $decode_it = ''; ! if ( $mystring =~ /=\?[\w-]+\?B\?(.*)\?=/i ) { ! $decode_it = decode_base64( $1 ); ! $mystring =~ s/=\?[\w-]+\?B\?(.*)\?=/$decode_it/i; ! } else { ! if ( $mystring =~ /=\?[\w-]+\?Q\?(.*)\?=/i ) { ! $decode_it = decode_qp( $1 ); ! $mystring =~ s/=\?[\w-]+\?Q\?(.*)\?=/$decode_it/i; ! } ! } ! return $mystring; } --- 1054,1073 ---- my ( $self, $mystring ) = @_; ! ! $mystring = Encode::MIME::Header::decode($Encode::Encoding{'MIME-Header'},$mystring); ! ! #my $decode_it = ''; ! #while ( $mystring =~ /=\?[\w-]+\?B\?(.*)\?=/ig ) { ! # $decode_it = decode_base64( $1 ); ! # $mystring =~ s/=\?[\w-]+\?B\?(.*)\?=/$decode_it/i; ! #} ! #while ( $mystring =~ /=\?[\w-]+\?Q\?(.*)\?=/ig ) { ! # $decode_it = $1; ! # $decode_it =~ s/\_/=20/g; ! # $decode_it = decode_qp( $decode_it ); ! # $mystring =~ s/=\?[\w-]+\?Q\?(.*)\?=/$decode_it/i; ! # } ! return $mystring; } *************** *** 1112,1115 **** --- 1117,1126 ---- $self->update_pseudoword( 'header', $header ); + # Check the encoding type in all RFC 2047 encoded headers + + if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) { + update_word( $self, $1, 0, '', '', 'charset' ); + } + # Handle the From, To and Cc headers and extract email addresses # from them and treat them as words *************** *** 1126,1130 **** my $prefix = ''; ! if ( $header =~ /^(From|To|Cc|Reply\-To)/i ) { if ( $argument =~ /=\?(.{1,40})\?/ ) { --- 1137,1145 ---- my $prefix = ''; ! if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) { ! ! # These headers at least can be decoded ! ! $argument = $self->decode_string( $argument ); if ( $argument =~ /=\?(.{1,40})\?/ ) { *************** *** 1132,1147 **** } ! if ( $header =~ /^From/i ) { $encoding = ''; ! $self->{content_type__} = ''; ! $self->{from__} = $self->decode_string( $argument ) if ( $self->{from__} eq '' ) ; $prefix = 'from'; } ! $prefix = 'to' if ( $header =~ /^To/i ); ! $self->{to__} = $self->decode_string( $argument ) if ( ( $header =~ /^To/i ) && ( $self->{to__} eq '' ) ); ! $prefix = 'cc' if ( $header =~ /^Cc/i ); ! $self->{cc__} = $self->decode_string( $argument ) if ( ( $header =~ /^Cc/i ) && ( $self->{cc__} eq '' ) ); while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { --- 1147,1166 ---- } ! if ( $header =~ /^From$/i ) { $encoding = ''; ! $self->{content_type__} = ''; ! $self->{from__} = $argument if ( $self->{from__} eq '' ) ; $prefix = 'from'; } ! if ( $header =~ /^To$/i ) { ! $prefix = 'to'; ! $self->{to__} = $argument if ( $self->{to__} eq '' ); ! } ! if ( $header =~ /^Cc$/i ) { ! $prefix = 'cc'; ! $self->{cc__} = $argument if ( $self->{cc__} eq '' ); ! } while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { *************** *** 1159,1166 **** } ! $self->{subject__} = $self->decode_string( $argument ) if ( ( $header =~ /^Subject/i ) && ( $self->{subject__} eq '' ) ); ! ! if ( $header =~ /^Subject/i ) { $prefix = 'subject'; } --- 1178,1185 ---- } ! if ( $header =~ /^Subject$/i ) { $prefix = 'subject'; + $argument = $self->decode_string( $argument ); + $self->{subject__} = $argument if ( ( $self->{subject__} eq '' ) ); } *************** *** 1169,1173 **** # Look for MIME ! if ( $header =~ /^Content-Type/i ) { if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { --- 1188,1192 ---- # Look for MIME ! if ( $header =~ /^Content-Type$/i ) { if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) { *************** *** 1207,1211 **** # do a special parse here since words might be broken across the boundaries ! if ( $header =~ /^Content-Transfer-Encoding/i ) { $encoding = $argument; print "Setting encoding to $encoding\n" if $self->{debug}; --- 1226,1230 ---- # do a special parse here since words might be broken across the boundaries ! if ( $header =~ /^Content-Transfer-Encoding$/i ) { $encoding = $argument; print "Setting encoding to $encoding\n" if $self->{debug}; *************** *** 1218,1223 **** # Some headers to discard ! return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)/i ); ! add_line( $self, $argument, 0, $prefix ); --- 1237,1246 ---- # Some headers to discard ! return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i ); ! ! # Some headers should never be RFC 2047 decoded ! ! $argument = $self->decode_string($argument) unless ($header =~ /^(Revceived|Content\-Type|Content\-Disposition)$/i); ! add_line( $self, $argument, 0, $prefix ); |