From: <ssc...@us...> - 2003-04-27 04:01:18
|
Update of /cvsroot/popfile/engine/Classifier In directory sc8-pr-cvs1:/tmp/cvs-serv20929 Modified Files: MailParse.pm Log Message: improve URL parsing and decoding Index: MailParse.pm =================================================================== RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v retrieving revision 1.119 retrieving revision 1.120 diff -C2 -d -r1.119 -r1.120 *** MailParse.pm 27 Apr 2003 02:57:12 -0000 1.119 --- MailParse.pm 27 Apr 2003 04:01:14 -0000 1.120 *************** *** 419,423 **** } else { ! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' ); # If the host name is not blank (i.e. there was a hostname in the url --- 419,423 ---- } else { ! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' ); # If the host name is not blank (i.e. there was a hostname in the url *************** *** 586,600 **** # Strip the protocol part of a URL (e.g. http://) ! $protocol = $1 if ( $url =~ s/^(.*)\:\/\/// ); # Remove any URL encoding (protocol may not be URL encoded) ! while ( $url =~ /(\%([0-9A-Fa-f][0-9A-Fa-f]))/g ) { ! my $from = "$1"; ! my $to = chr(hex("0x$2")); ! $url =~ s/$from/$to/g; ! $self->{ut__} =~ s/$from/$to/g; ! print "$from -> $to\n" if $self->{debug}; increment_word( $self, "html:encodedurl" ); } --- 586,598 ---- # Strip the protocol part of a URL (e.g. http://) ! $protocol = $1 if ( $url =~ s/^([^:]*)\:\/\/// ); # Remove any URL encoding (protocol may not be URL encoded) ! if ( $url =~ s/(\%([0-9A-Fa-f][0-9A-Fa-f]))/chr(hex("0x$2"))/eg ) { increment_word( $self, "html:encodedurl" ); + my $new_url = (defined $protocol?"$protocol://":'') . $url; + print "$temp_url -> " . $new_url . "\n" if $self->{debug}; + $self->{ut__} =~ s/$temp_url/$new_url/e if $self->{color__}; } |