|
From: <jgr...@us...> - 2003-08-22 02:24:04
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv20578
Modified Files:
MailParse.pm
Log Message:
Merge patch to remove invalid HTML tags and empty pairs
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.156
retrieving revision 1.157
diff -C2 -d -r1.156 -r1.157
*** MailParse.pm 20 Aug 2003 06:05:59 -0000 1.156
--- MailParse.pm 21 Aug 2003 17:57:35 -0000 1.157
***************
*** 61,64 ****
--- 61,85 ----
'yuml' => 255 ); # PROFILE BLOCK STOP
+ # All known HTML tags divided into two groups: tags that generate
+ # whitespace as in 'foo<br></br>bar' and tags that don't such as
+ # 'foo<b></b>bar'. The first case shouldn't count as an empty pair
+ # because it breaks the line. The second case doesn't have any visual
+ # impact and is treated as 'foobar' with an empty pair.
+
+ my $spacing_tags = "address|applet|area|base|basefont" . # PROFILE BLOCK START
+ "|bdo|bgsound|blockquote|body|br|button|caption" .
+ "|center|col|colgroup|dd|dir|div|dl|dt|embed" .
+ "|fieldset|form|frame|frameset|h1|h2|h3|h4|h5|h6" .
+ "|head|hr|html|iframe|ilayer|input|isindex|label" .
+ "|legend|li|link|listing|map|menu|meta|multicol" .
+ "|nobr|noembed|noframes|nolayer|noscript|object" .
+ "|ol|optgroup|option|p|param|plaintext|pre|script" .
+ "|select|spacer|style|table|tbody|td|textarea" .
+ "|tfoot|th|thead|title|tr|ul|wbr|xmp"; # PROFILE BLOCK STOP
+
+ my $non_spacing_tags = "a|abbr|acronym|b|big|blink" . # PROFILE BLOCK START
+ "|cite|code|del|dfn|em|font|i|img|ins|kbd|q|s" .
+ "|samp|small|span|strike|strong|sub|sup|tt|u|var"; # PROFILE BLOCK STOP
+
#----------------------------------------------------------------------------
# new
***************
*** 912,915 ****
--- 933,960 ----
$self->update_pseudoword( 'html', 'comment', $encoded, $1 );
print "$line\n" if $self->{debug__};
+ }
+
+ # Remove invalid tags. This finds tags of the form [a-z0-9]+ with
+ # optional attributes and removes them if the tag isn't
+ # recognized.
+
+ # FIXME: This also removes tags in plain text emails so a sentence
+ # such as 'To run the program type "program <filename>".' is also
+ # affected. The correct fix seems to be to look at the
+ # Content-Type header and only process mails of type text/html.
+
+ while ( $line =~ s/(<\/?(?!(?:$spacing_tags|$non_spacing_tags)\W)[a-z0-9]+(?:\s+.*?)?\/?>)//i ) {
+ $self->update_pseudoword( 'html', 'invalidtag', $encoded, $1 );
+ print "html:invalidtag: $1\n" if $self->{debug};
+ }
+
+ # Remove pairs of non-spacing tags without content such as <b></b>
+ # and also <b><i></i></b>.
+
+ # FIXME: What about combined open and close tags such as <b />?
+
+ while ( $line =~s/(<($non_spacing_tags)(?:\s+[^>]*?)?><\/\2>)//i ) {
+ $self->update_pseudoword( 'html', 'emptypair', $encoded, $1 );
+ print "html:emptypair: $1\n" if $self->{debug};
}
|