|
From: <jgr...@us...> - 2003-04-29 13:22:34
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv3735/Classifier
Modified Files:
MailParse.pm
Log Message:
Fix bug in HTML parser where tags split across lines got spaces added, messing up attribute values; fix bug in invisibleink detection; strip CRLF from quoted-printable decoded lines
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.120
retrieving revision 1.121
diff -C2 -d -r1.120 -r1.121
*** MailParse.pm 27 Apr 2003 04:01:14 -0000 1.120
--- MailParse.pm 29 Apr 2003 13:22:30 -0000 1.121
***************
*** 154,157 ****
--- 154,158 ----
my ( $self, $prefix, $word ) = @_;
+ print "update_pseudoword:$prefix:$word\n";
$self->increment_word( "$prefix:$word" );
}
***************
*** 319,323 ****
}
} else {
! $self->increment_word( 'trick:invisibleink' );
}
}
--- 320,326 ----
}
} else {
! if ( $bigline ne '' ) {
! $self->increment_word( 'trick:invisibleink' );
! }
}
}
***************
*** 722,727 ****
my $found = 1;
! $line =~ s/[\r\n]+/ /g;
! $line =~ s/[\t ]+$//;
print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug};
--- 725,729 ----
my $found = 1;
! $line =~ s/[\r\n]+//gm;
print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug};
***************
*** 737,742 ****
$found = 0;
- $line =~ s/^[\t ]+//;
-
# If we are in an HTML tag then look for the close of the tag, if we get it then
# handle the tag, if we don't then keep building up the arguments of the tag
--- 739,742 ----
***************
*** 744,748 ****
if ( $self->{in_html_tag__} ) {
if ( $line =~ s/^(.*?)>// ) {
! $self->{html_arg__} .= ' ' . $1;
$self->{in_html_tag__} = 0;
$self->{html_tag__} =~ s/=\n ?//g;
--- 744,748 ----
if ( $self->{in_html_tag__} ) {
if ( $line =~ s/^(.*?)>// ) {
! $self->{html_arg__} .= $1;
$self->{in_html_tag__} = 0;
$self->{html_tag__} =~ s/=\n ?//g;
***************
*** 754,758 ****
next;
} else {
! $self->{html_arg__} .= ' ' . $line;
return 1;
}
--- 754,758 ----
next;
} else {
! $self->{html_arg__} .= $line;
return 1;
}
***************
*** 1012,1015 ****
--- 1012,1016 ----
if ( $encoding =~ /quoted\-printable/i ) {
$line = decode_qp( $line );
+ $line =~ s/[\r\n]+$//g;
$self->{ut__} = decode_qp( $self->{ut__} ) if ( $self->{color__} );
}
|