From: <jgr...@us...> - 2003-04-29 13:22:34
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv3735/Classifier Modified Files: MailParse.pm Log Message: Fix bug in HTML parser where tags split across lines got spaces added messing up attribute values; fix bug in invisibleink detection; strip CRLF from quoted-printable decoded lines Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.120 retrieving revision 1.121 diff -C2 -d -r1.120 -r1.121 *** MailParse.pm 27 Apr 2003 04:01:14 -0000 1.120 --- MailParse.pm 29 Apr 2003 13:22:30 -0000 1.121 *************** *** 154,157 **** --- 154,158 ---- my ( $self, $prefix, $word ) = @_; + print "update_pseudoword:$prefix:$word\n"; $self->increment_word( "$prefix:$word" ); } *************** *** 319,323 **** } } else { ! $self->increment_word( 'trick:invisibleink' ); } } --- 320,326 ---- } } else { ! if ( $bigline ne '' ) { ! $self->increment_word( 'trick:invisibleink' ); ! } } } *************** *** 722,727 **** my $found = 1; ! $line =~ s/[\r\n]+/ /g; ! $line =~ s/[\t ]+$//; print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug}; --- 725,729 ---- my $found = 1; ! $line =~ s/[\r\n]+//gm; print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug}; *************** *** 737,742 **** $found = 0; - $line =~ s/^[\t ]+//; - # If we are in an HTML tag then look for the close of the tag, if we get it then # handle the tag, if we don't then keep building up the arguments of the tag --- 739,742 ---- *************** *** 744,748 **** if ( $self->{in_html_tag__} ) { if ( $line =~ s/^(.*?)>// ) { ! $self->{html_arg__} .= ' ' . $1; $self->{in_html_tag__} = 0; $self->{html_tag__} =~ s/=\n ?//g; --- 744,748 ---- if ( $self->{in_html_tag__} ) { if ( $line =~ s/^(.*?)>// ) { ! $self->{html_arg__} .= $1; $self->{in_html_tag__} = 0; $self->{html_tag__} =~ s/=\n ?//g; *************** *** 754,758 **** next; } else { ! $self->{html_arg__} .= ' ' . $line; return 1; } --- 754,758 ---- next; } else { ! $self->{html_arg__} .= $line; return 1; } *************** *** 1012,1015 **** --- 1012,1016 ---- if ( $encoding =~ /quoted\-printable/i ) { $line = decode_qp( $line ); + $line =~ s/[\r\n]+$//g; $self->{ut__} = decode_qp( $self->{ut__} ) if ( $self->{color__} ); } |