From: <ssc...@us...> - 2003-04-07 20:36:59
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv31327 Modified Files: MailParse.pm Log Message: complete multi-line header parsing to colorize correctly Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.104 retrieving revision 1.105 diff -C2 -d -r1.104 -r1.105 *** MailParse.pm 7 Apr 2003 17:55:46 -0000 1.104 --- MailParse.pm 7 Apr 2003 20:36:52 -0000 1.105 *************** *** 371,375 **** $value = $3; $quote = ''; ! $end_quote = '[\> \t\&]'; if (defined $2) { $quote = $2; --- 371,375 ---- $value = $3; $quote = ''; ! $end_quote = '[\> \t\&\n]'; if (defined $2) { $quote = $2; *************** *** 678,682 **** $line =~ s/[\t ]+$//; ! print "parse_html: [$line] $self->{in_html_tag__}\n" if $self->{debug}; # Remove HTML comments and other tags that begin ! --- 678,682 ---- $line =~ s/[\t ]+$//; ! print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug}; # Remove HTML comments and other tags that begin ! *************** *** 783,788 **** $self->{base64__} = ''; ! $self->{in_html_tag__} = 0; $self->{html_tag__} = ''; $self->{html_arg__} = ''; --- 783,793 ---- $self->{base64__} = ''; ! ! # Variable to note that the temporary colorized storage is "frozen", ! # and what type of freeze it is (allows nesting of reasons to freeze ! # colorization) ! $self->{in_html_tag__} = 0; + $self->{html_tag__} = ''; $self->{html_arg__} = ''; *************** *** 823,870 **** print ">>> $line" if $self->{debug}; ! ! if ( $self->{color__} ) { ! my $splitline = $line; ! $splitline =~ s/([^\r\n]{100,120} )/$1\r\n/g; ! $splitline =~ s/([^ \r\n]{120})/$1\r\n/g; ! ! if ( !$self->{in_html_tag__} ) { ! $colorized .= $self->{ut__} if ( $self->{ut__} ne '' ); ! ! $self->{ut__} = ''; ! } ! ! $splitline =~ s/</&lt;/g; ! $splitline =~ s/>/&gt;/g; ! ! #TODO: regress patch to 0.18.1 ! if ( $encoding =~ /quoted\-printable/i ) { ! 
$splitline =~ s/=3C/&lt;/g; ! $splitline =~ s/=3E/&gt;/g; ! } ! ! $splitline =~ s/\t/ /g; ! ! $self->{ut__} .= $splitline; ! } if ($self->{in_headers__}) { # Check for blank line signifying end of headers if ( $line =~ /^(\r\n|\r|\n)/) { ! ! # Parse the last header ! ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); # Clear the saved headers $header = ''; $argument = ''; $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; } # If we have an email header then just keep the part after the : --- 828,863 ---- print ">>> $line" if $self->{debug}; ! ! $colorized .= $self->{ut__}; ! $self->{ut__} = ''; ! ! $self->{ut__} .= splitline($line, $encoding); if ($self->{in_headers__}) { + + # temporary colorization while in headers is handled within parse_header + + $self->{ut__} = ''; # Check for blank line signifying end of headers if ( $line =~ /^(\r\n|\r|\n)/) { ! ! # Parse the last header ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); # Clear the saved headers $header = ''; $argument = ''; + + $self->{ut__} .= splitline("\015\012", 0); $self->{in_headers__} = 0; print "Header parsing complete.\n" if $self->{debug}; + + next; } + # If we have an email header then just keep the part after the : *************** *** 872,877 **** # Parse the last header ! ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding); # Save the new information for the current header --- 865,870 ---- # Parse the last header ! ! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding) if ($header ne ''); # Save the new information for the current header *************** *** 879,888 **** $header = $1; $argument = $2; } # Append to argument if the next line begins with whitespace (isn't a new header) ! ! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) { ! 
$argument .= $1; } next; --- 872,882 ---- $header = $1; $argument = $2; + next; } # Append to argument if the next line begins with whitespace (isn't a new header) ! ! if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) { ! $argument .= "\015\012" . $1; } next; *************** *** 982,985 **** --- 976,981 ---- $colorized .= clear_out_base64( $self ); close MSG; + + $self->{in_html_tag__} = 0; if ( $self->{color__} ) { *************** *** 1099,1105 **** sub parse_header { ! my ($self, $header, $argument, $mime, $encoding ) = @_; print "Header ($header) ($argument)\n" if ($self->{debug}); # After a discussion with Tim Peters and some looking at emails --- 1095,1107 ---- sub parse_header { ! my ($self, $header, $argument, $mime, $encoding) = @_; print "Header ($header) ($argument)\n" if ($self->{debug}); + + # Remove over-reading + $self->{ut__} = ''; + + # Qeueue just this header for colorization + $self->{ut__} = splitline("$header: $argument\015\012", $encoding); # After a discussion with Tim Peters and some looking at emails *************** *** 1112,1115 **** --- 1114,1118 ---- # Handle the From, To and Cc headers and extract email addresses # from them and treat them as words + # For certain headers we are going to mark them specially in the corpus *************** *** 1221,1225 **** return ($mime, $encoding); } ! # GETTERS/SETTERS --- 1224,1256 ---- return ($mime, $encoding); } ! ! ! # --------------------------------------------------------------------------------------------- ! # ! # splitline - Escapes characters so a line will print as plain-text within a HTML document. ! # ! # $line The line to escape ! # $encoding The value of any current encoding scheme ! # ! # --------------------------------------------------------------------------------------------- ! ! sub splitline ! { ! my ($line, $encoding) = @_; ! $line =~ s/([^\r\n]{100,120} )/$1\r\n/g; ! $line =~ s/([^ \r\n]{120})/$1\r\n/g; ! ! $line =~ s/</&lt;/g; ! $line =~ s/>/&gt;/g; ! ! 
if ( $encoding =~ /quoted\-printable/i ) { ! $line =~ s/=3C/&lt;/g; ! $line =~ s/=3E/&gt;/g; ! } ! ! $line =~ s/\t/ /g; ! ! return $line; ! } # GETTERS/SETTERS *************** *** 1231,1233 **** --- 1262,1269 ---- return $self->{first20__}; } + + + 1; + + |