From: Wim K. <wi...@us...> - 2002-04-09 04:22:03
|
Update of /cvsroot/acmemail/sparkle/Acmemail/Filter In directory usw-pr-cvs1:/tmp/cvs-serv31893/Acmemail/Filter Added Files: filterhtml.pl Log Message: Adding whitelist filtering, using the NMS filter library from http://nick.cleaton.net/projects/nmsfilt.html See the sparkle mailing archives for 'defang vulnerabilites' for more information. --- NEW FILE: filterhtml.pl --- # # Copyright 2001 London Perl Mongers, All rights reserved # # This file is free software; you are free to redistibute it # and/or modify it under the same terms as Perl itself. # ################################################################## # # HTML handling code # # The code below provides some functions for manipulating HTML. # # check_url_valid ( URL ) # # Returns 1 if the string URL is a valid http, https or ftp # URL, 0 otherwise. # # process_html ( INPUT [,LINE_BREAKS [,ALLOW]] ) # # Returns a modified version of the HTML string INPUT, with # any potentially malicious HTML constructs (such as java, # javascript and IMG tags) removed. # # If the LINE_BREAKS parameter is present and true then # line breaks in the input will be converted to html <br /> # tags in the output. # # If the ALLOW parameter is present and true then most # harmless tags will be left in, otherwise all tags will be # removed. # # escape_html ( INPUT ) # # Returns a copy of the string INPUT with any HTML # metacharacters replaced with character escapes. # # unescape_html ( INPUT ) # # Returns a copy of the string INPUT with HTML character # entities converted to literal characters where possible. # Note that some entites have no 8-bit character equivalent, # see "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent" # for some examples. unescape_html() leaves these entities # in their encoded form. # use vars qw(%html_entities $html_safe_chars %escape_html_map); use vars qw(%safe_tags %safe_style %tag_is_empty $convert_nl %auto_deinterleave $auto_deinterleave_pattern); # check the validity of a URL. sub check_url_valid { my $url = shift; $url =~ m< ^ (?:ftp|http|https):// [\w\-\.]+ (?:\:\d+)? (?: / [\w\-.!~*'(|);/?\@&=+\$,%#]* )? $ >x ? 1 : 0; } sub process_html { my ($text, $line_breaks, $allow_html) = @_; cleanup_html( $text, $line_breaks, ($allow_html ? \%safe_tags : {}) ); } BEGIN { %html_entities = ( 'lt' => '<', 'gt' => '>', 'quot' => '"', 'amp' => '&', 'nbsp' => "\240", 'iexcl' => "\241", 'cent' => "\242", 'pound' => "\243", 'curren' => "\244", 'yen' => "\245", 'brvbar' => "\246", 'sect' => "\247", 'uml' => "\250", 'copy' => "\251", 'ordf' => "\252", 'laquo' => "\253", 'not' => "\254", 'shy' => "\255", 'reg' => "\256", 'macr' => "\257", 'deg' => "\260", 'plusmn' => "\261", 'sup2' => "\262", 'sup3' => "\263", 'acute' => "\264", 'micro' => "\265", 'para' => "\266", 'middot' => "\267", 'cedil' => "\270", 'supl' => "\271", 'ordm' => "\272", 'raquo' => "\273", 'frac14' => "\274", 'frac12 '=> "\275", 'frac34' => "\276", 'iquest' => "\277", 'Agrave' => "\300", 'Aacute' => "\301", 'Acirc' => "\302", 'Atilde' => "\303", 'Auml' => "\304", 'Aring' => "\305", 'AElig' => "\306", 'Ccedil' => "\307", 'Egrave' => "\310", 'Eacute' => "\311", 'Ecirc' => "\312", 'Euml' => "\313", 'Igrave' => "\314", 'Iacute' => "\315", 'Icirc' => "\316", 'Iuml' => "\317", 'ETH' => "\320", 'Ntilde' => "\321", 'Ograve' => "\322", 'Oacute' => "\323", 'Ocirc' => "\324", 'Otilde' => "\325", 'Ouml' => "\326", 'times' => "\327", 'Oslash' => "\330", 'Ugrave' => "\331", 'Uacute' => "\332", 'Ucirc' => "\333", 'Uuml' => "\334", 'Yacute' => "\335", 'THORN' => "\336", 'szlig' => "\337", 'agrave' => "\340", 'aacute' => "\341", 'acirc' => "\342", 'atilde' => "\343", 'auml' => "\344", 'aring' => "\345", 'aelig' => "\346", 'ccedil' => "\347", 'egrave' => "\350", 'eacute' => "\351", 'ecirc' => "\352", 'euml' => "\353", 'igrave' => "\354", 'iacute' => "\355", 'icirc' => "\356", 'iuml' => "\357", 'eth' => "\360", 'ntilde' => "\361", 'ograve' => "\362", 'oacute' => "\363", 'ocirc' => "\364", 'otilde' => "\365", 'ouml' => "\366", 'divide' => "\367", 'oslash' => "\370", 'ugrave' => "\371", 'uacute' => "\372", 'ucirc' => "\373", 'uuml' => "\374", 'yacute' => "\375", 'thorn' => "\376", 'yuml' => "\377", ); # # Build a map for representing characters in HTML. # $html_safe_chars = '()[]{}/?.,\\|;:@#~=+-_*^%$! ' . "\r\n\t"; %escape_html_map = map {$_,$_} ( 'A'..'Z', 'a'..'z', '0'..'9', split(//, $html_safe_chars) ); foreach my $ent (keys %html_entities) { $escape_html_map{$html_entities{$ent}} = "&$ent;"; } foreach my $c (0..255) { unless ( exists $escape_html_map{chr $c} ) { $escape_html_map{chr $c} = sprintf '&#%d;', $c; } } # # Tables for use by cleanup_html() (below). # # The main table is %safe_tags, which is a hash by tag name of # all the tags that it's safe to leave in. The value for each # tag is another hash, and each key of that hash defines an # attribute that the tag is allowed to have. # # The values in the tag attribute hash can be undef (for an # attribute that takes no value, for example the nowrap # attribute in the tag <td align="left" nowrap>) or they can # be coderefs pointing to subs for cleaning up the attribute # values. # # These subs will called with the attribute value in $_, and # they can return either a cleaned attribute value or undef. # If undef is returned then the attribute will be deleted # from the tag. # # The list of tags and attributes was taken from # "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" # # The %tag_is_empty table defines the set of tags that have # no corresponding close tag. # # cleanup_html() moves close tags around to force all tags to # be closed in the correct sequence. For example, the text # "<h1><i>foo</h1>bar</i>" will be converted to the text # "<h1><i>foo</i></h1>bar". # # The %auto_deinterleave table defines the set of tags which # should be automatically reopened if they're closed early # in this way. All the tags involved must be in # %auto_deinterleave for the tag to be reopened. For example, # the text "<b>bb<i>bi</b>ii</i>" will be converted into the # text "<b>bb<i>bi</i></b><i>ii</i>" rather than into the # text "<b>bb<i>bi</i></b>ii", because *both* "b" and "i" are # in %auto_deinterleave. # %tag_is_empty = ( 'hr' => 1, 'br' => 1, 'basefont' => 1 ); %auto_deinterleave = map {$_,1} qw( tt i b big small u s strike font basefont em strong dfn code q sub sup samp kbd var cite abbr acronym span ); $auto_deinterleave_pattern = join '|', keys %auto_deinterleave; my %attr = ( 'style' => \&cleanup_attr_style ); my %font_attr = ( %attr, size => sub { /^([-+]?\d{1,3})$/ ? $1 : undef }, face => sub { /^([\w\-, ]{2,100})$/ ? $1 : undef }, color => \&cleanup_attr_color, ); my %insdel_attr = ( %attr, 'cite' => \&cleanup_attr_uri, 'datetime' => \&cleanup_attr_text, ); my %texta_attr = ( %attr, align => sub { s/middle/center/i; /^(left|center|right)$/i ? lc $1 : undef }, ); my %cellha_attr = ( align => sub { s/middle/center/i; /^(left|center|right|justify|char)$/i ? lc $1 : undef }, char => sub { /^([\w\-])$/ ? $1 : undef }, charoff => \&cleanup_attr_length, ); my %cellva_attr = ( valign => sub { s/center/middle/i; /^(top|middle|bottom|baseline)$/i ? lc $1 : undef }, ); my %cellhv_attr = ( %attr, %cellha_attr, %cellva_attr ); my %col_attr = ( %attr, width => \&cleanup_attr_multilength, span => \&cleanup_attr_number, %cellhv_attr, ); my %thtd_attr = ( %attr, abbr => \&cleanup_attr_text, axis => \&cleanup_attr_text, headers => \&cleanup_attr_text, scope => sub { /^(row|col|rowgroup|colgroup)$/i ? lc $1 : undef }, rowspan => \&cleanup_attr_number, colspan => \&cleanup_attr_number, %cellhv_attr, nowrap => undef, bgcolor => \&cleanup_attr_color, width => \&cleanup_attr_number, height => \&cleanup_attr_number, ); my $none = {}; %safe_tags = ( 'br' => { 'clear' => sub { /^(left|right|all|none)$/i ? lc $1 : undef } }, 'em' => \%attr, 'strong' => \%attr, 'dfn' => \%attr, 'code' => \%attr, 'samp' => \%attr, 'kbd' => \%attr, 'var' => \%attr, 'cite' => \%attr, 'abbr' => \%attr, 'acronym' => \%attr, 'q' => { %attr, 'cite' => \&cleanup_attr_uri }, 'blockquote' => { %attr, 'cite' => \&cleanup_attr_uri }, 'sub' => \%attr, 'sup' => \%attr, 'tt' => \%attr, 'i' => \%attr, 'b' => \%attr, 'big' => \%attr, 'small' => \%attr, 'u' => \%attr, 's' => \%attr, 'font' => \%font_attr, 'table' => { %attr, 'frame' => \&cleanup_attr_tframe, 'rules' => \&cleanup_attr_trules, %texta_attr, 'bgcolor' => \&cleanup_attr_color, 'width' => \&cleanup_attr_length, 'cellspacing' => \&cleanup_attr_length, 'cellpadding' => \&cleanup_attr_length, 'border' => \&cleanup_attr_number, 'summary' => \&cleanup_attr_text, }, 'caption' => { %attr, 'align' => sub { /^(top|bottom|left|right)$/i ? lc $1 : undef }, }, 'colgroup' => \%col_attr, 'col' => \%col_attr, 'thead' => \%cellhv_attr, 'tfoot' => \%cellhv_attr, 'tbody' => \%cellhv_attr, 'tr' => { %attr, bgcolor => \&cleanup_attr_color, %cellhv_attr, }, 'th' => \%thtd_attr, 'td' => \%thtd_attr, 'ins' => \%insdel_attr, 'del' => \%insdel_attr, 'a' => { %attr, href => \&cleanup_attr_uri, }, 'h1' => \%texta_attr, 'h2' => \%texta_attr, 'h3' => \%texta_attr, 'h4' => \%texta_attr, 'h5' => \%texta_attr, 'h6' => \%texta_attr, 'p' => \%texta_attr, 'div' => \%texta_attr, 'span' => \%texta_attr, 'ul' => { %attr, 'type' => sub { /^(disc|square|circle)$/i ? lc $1 : undef }, 'compact' => undef, }, 'ol' => { %attr, 'type' => \&cleanup_attr_text, 'compact' => undef, 'start' => \&cleanup_attr_number, }, 'li' => { %attr, 'type' => \&cleanup_attr_text, 'value' => \&cleanup_no_number, }, 'dl' => { %attr, 'compact' => undef }, 'dt' => \%attr, 'dd' => \%attr, 'address' => \%attr, 'pre' => { %attr, 'width' => \&cleanup_attr_number }, 'center' => \%attr, 'nobr' => $none, ); %safe_style = ( 'color' => \&cleanup_attr_color, 'background-color' => \&cleanup_attr_color, # XXX TODO: the CSS spec defines loads more, add 'em ); } sub cleanup_attr_style { my @clean = (); foreach my $elt (split /;/, $_) { next if $elt =~ m#^\s*$#; if ( $elt =~ m#^\s*([\w\-]+)\s*:\s*(.+?)\s*$#s ) { my ($key, $val) = (lc $1, $2); local $_ = $val; my $sub = $safe_style{$key}; if (defined $sub) { my $cleanval = &{$sub}(); if (defined $cleanval) { push @clean, "$key:$val"; } elsif ($DEBUGGING) { push @debug_msg, "style $key: bad value <$val>"; } } elsif ($DEBUGGING) { push @debug_msg, "rejected style element <$key>"; } } elsif ($DEBUGGING) { push @debug_msg, "malformed style element <$elt>"; } } return join '; ', @clean; } sub cleanup_attr_number { /^(\d+)$/ ? $1 : undef; } sub cleanup_attr_multilength { /^(\d+(?:\.\d+)?[*%]?)$/ ? $1 : undef; } sub cleanup_attr_text { tr/-a-zA-Z0-9()[]{}\/?.,\\|;:@#~=+*^%$! //dc; $_; } sub cleanup_attr_length { /^(\d+\%?)$/ ? $1 : undef; } sub cleanup_attr_color { /^(\w{2,20}|#[\da-fA-F]{6})$/ or die "color <<$_>> bad"; /^(\w{2,20}|#[\da-fA-F]{6})$/ ? $1 : undef; } sub cleanup_attr_uri { check_url_valid($_) ? $_ : undef; } sub cleanup_attr_tframe { /^(void|above|below|hsides|lhs|rhs|vsides|box|border)$/i ? lc $1 : undef; } sub cleanup_attr_trules { /^(none|groups|rows|cols|all)$/i ? lc $1 : undef; } use vars qw(@stack $safe_tags $convert_nl); sub cleanup_html { local ($_, $convert_nl, $safe_tags) = @_; local @stack = (); s[ (?: <!--.*?--> ) | (?: <[?!].*?> ) | (?: <([a-z0-9]+)\b((?:[^>'"]|"[^"]*"|'[^']*')*)> ) | (?: </([a-z0-9]+)> ) | (?: (.[^<]*) ) ][ defined $1 ? cleanup_tag(lc $1, $2) : defined $3 ? cleanup_close(lc $3) : defined $4 ? cleanup_cdata($4) : '' ]igesx; # Close anything that was left open $_ .= join '', map "</$_->{NAME}>", @stack; # Where we turned <i><b>foo</i></b> into <i><b>foo</b></i><b></b>, # take out the pointless <b></b>. 1 while s#<($auto_deinterleave_pattern)\b[^>]*></\1>##go; return $_; } sub cleanup_tag { my ($tag, $attrs) = @_; unless (exists $safe_tags->{$tag}) { push @debug_msg, "reject tag <$tag>" if $DEBUGGING; return ''; } my $t = $safe_tags->{$tag}; my $safe_attrs = ''; while ($attrs =~ s#^\s*(\w+)(?:\s*=\s*(?:([^"'>\s]+)|"([^"]*)"|'([^']*)'))?##) { my $attr = lc $1; my $val = ( defined $2 ? $2 : defined $3 ? unescape_html($3) : defined $4 ? unescape_html($4) : '' ); unless (exists $t->{$attr}) { push @debug_msg, "<$tag>: attr '$attr' rejected" if $DEBUGGING; next; } if (defined $t->{$attr}) { local $_ = $val; my $cleaned = &{ $t->{$attr} }(); if (defined $cleaned) { $safe_attrs .= qq| $attr="${\( escape_html($cleaned) )}"|; if ($DEBUGGING and $cleaned ne $val) { push @debug_msg, "<$tag>'$attr':val [$val]->[$cleaned]"; } } elsif ($DEBUGGING) { push @debug_msg, "<$tag>'$attr':val [$val] rejected"; } } else { $safe_attrs .= " $attr"; } } if (exists $tag_is_empty{$tag}) { return "<$tag$safe_attrs />"; } else { my $html = "<$tag$safe_attrs>"; unshift @stack, { NAME => $tag, FULL => $html }; return $html; } } sub cleanup_close { my $tag = shift; # Ignore a close without an open unless (grep {$_->{NAME} eq $tag} @stack) { push @debug_msg, "misplaced </$tag> rejected" if $DEBUGGING; return ''; } # Close open tags up to the matching open my @close = (); while (scalar @stack and $stack[0]{NAME} ne $tag) { push @close, shift @stack; } push @close, shift @stack; my $html = join '', map {"</$_->{NAME}>"} @close; # Reopen any we closed early if all that were closed are # configured to be auto deinterleaved. unless (grep {! exists $auto_deinterleave{$_->{NAME}} } @close) { pop @close; $html .= join '', map {$_->{FULL}} reverse @close; unshift @stack, @close; } return $html; } sub cleanup_cdata { local $_ = shift; s[ (?: & ( [a-zA-Z0-9]{2,15} | [#][0-9]{2,6} | [#][xX][a-fA-F0-9]{2,6} | ) \b ;? ) | (.) ][ defined $1 ? "&$1;" : $escape_html_map{$2} ]gesx; # substitute newlines in the input for html line breaks if required. s%\cM\n%<br />\n%g if $convert_nl; return $_; } # subroutine to escape the necessary characters to the appropriate HTML # entities sub escape_html { my $str = shift; $str =~ s/([^\w\Q$html_safe_chars\E])/$escape_html_map{$1}/og; return $str; } # subroutine to unescape escaped HTML entities. Note that some entites # have no 8-bit character equivalent, see # "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent" for some examples. # unescape_html() leaves these entities in their encoded form. sub unescape_html { my $str = shift; $str =~ s/ &( (\w+) | [#](\d+) ) \b (;?) / defined $2 && exists $html_entities{$2} ? $html_entities{$2} : defined $3 && $3 > 0 && $3 <= 255 ? chr $3 : "&$1$4" /gex; return $str; } # # End of HTML handling code # ################################################################## 1; |