|
From: <ssc...@us...> - 2003-04-27 04:01:18
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv20929
Modified Files:
MailParse.pm
Log Message:
improve URL parsing and decoding
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.119
retrieving revision 1.120
diff -C2 -d -r1.119 -r1.120
*** MailParse.pm 27 Apr 2003 02:57:12 -0000 1.119
--- MailParse.pm 27 Apr 2003 04:01:14 -0000 1.120
***************
*** 419,423 ****
} else {
! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' );
# If the host name is not blank (i.e. there was a hostname in the url
--- 419,423 ----
} else {
! my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '' );
# If the host name is not blank (i.e. there was a hostname in the url
***************
*** 586,600 ****
# Strip the protocol part of a URL (e.g. http://)
! $protocol = $1 if ( $url =~ s/^(.*)\:\/\/// );
# Remove any URL encoding (protocol may not be URL encoded)
! while ( $url =~ /(\%([0-9A-Fa-f][0-9A-Fa-f]))/g ) {
! my $from = "$1";
! my $to = chr(hex("0x$2"));
! $url =~ s/$from/$to/g;
! $self->{ut__} =~ s/$from/$to/g;
! print "$from -> $to\n" if $self->{debug};
increment_word( $self, "html:encodedurl" );
}
--- 586,598 ----
# Strip the protocol part of a URL (e.g. http://)
! $protocol = $1 if ( $url =~ s/^([^:]*)\:\/\/// );
# Remove any URL encoding (protocol may not be URL encoded)
! if ( $url =~ s/(\%([0-9A-Fa-f][0-9A-Fa-f]))/chr(hex("0x$2"))/eg ) {
increment_word( $self, "html:encodedurl" );
+ my $new_url = (defined $protocol?"$protocol://":'') . $url;
+ print "$temp_url -> " . $new_url . "\n" if $self->{debug};
+ $self->{ut__} =~ s/$temp_url/$new_url/e if $self->{color__};
}
|