Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv16302/Classifier
Modified Files:
MailParse.pm
Log Message:
Improve the handling of the words found in HTML once HTML tags are removed
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.162
retrieving revision 1.163
diff -C2 -d -r1.162 -r1.163
*** MailParse.pm 22 Sep 2003 13:27:21 -0000 1.162
--- MailParse.pm 24 Sep 2003 15:50:58 -0000 1.163
***************
*** 978,981 ****
--- 978,982 ----
my $found = 1;
+ my $non_html = '';
$line =~ s/[\r\n]+//gm;
***************
*** 994,998 ****
# recognized.
! # FIXME: This also removes tags in plain text emails so a sentence
# such as 'To run the program type "program <filename>".' is also
# effected. The correct fix seams to be to look at the
--- 995,999 ----
# recognized.
! # TODO: This also removes tags in plain text emails so a sentence
# such as 'To run the program type "program <filename>".' is also
# effected. The correct fix seams to be to look at the
***************
*** 1006,1011 ****
# Remove pairs of non-spacing tags without content such as <b></b>
# and also <b><i></i></b>.
!
! # FIXME: What about combined open and close tags such as <b />?
while ( $line =~s/(<($non_spacing_tags)(?:\s+[^>]*?)?><\/\2>)//io ) {
--- 1007,1012 ----
# Remove pairs of non-spacing tags without content such as <b></b>
# and also <b><i></i></b>.
!
! # TODO: What about combined open and close tags such as <b />?
while ( $line =~s/(<($non_spacing_tags)(?:\s+[^>]*?)?><\/\2>)//io ) {
***************
*** 1033,1036 ****
--- 1034,1038 ----
} else {
$self->{html_arg__} .= $line;
+ $self->add_line( $non_html, $encoded, '' );
return 1;
}
***************
*** 1055,1058 ****
--- 1057,1061 ----
$self->{html_arg__} = $3;
$self->{in_html_tag__} = 1;
+ $self->add_line( $non_html, $encoded, '' );
return 1;
}
***************
*** 1064,1070 ****
if ( $line =~ s/^([^<]+)(<|$)/$2/ ) {
$found = 1;
! add_line( $self, $1, $encoded, '' );
}
}
return 0;
--- 1067,1075 ----
if ( $line =~ s/^([^<]+)(<|$)/$2/ ) {
$found = 1;
! $non_html .= $1;
}
}
+
+ $self->add_line( $non_html, $encoded, '' );
return 0;
|