|
From: <jgr...@us...> - 2003-06-09 18:33:39
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv21196/Classifier
Modified Files:
MailParse.pm
Log Message:
Prevent a duplicate call to add_url when parsing img tags, and prevent spaces from being captured in charset names
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.131
retrieving revision 1.132
diff -C2 -d -r1.131 -r1.132
*** MailParse.pm 29 May 2003 10:49:06 -0000 1.131
--- MailParse.pm 9 Jun 2003 18:33:36 -0000 1.132
***************
*** 447,450 ****
--- 447,452 ----
}
}
+
+ next;
}
***************
*** 536,540 ****
if ( ( $attribute =~ /^content$/i ) && ( $tag =~ /^meta$/i ) ) {
! if ( $value=~ /charset=(.{1,40})[\"\>]?/ ) {
update_word( $self, $1, $encoded, '', '', '' );
}
--- 538,542 ----
if ( ( $attribute =~ /^content$/i ) && ( $tag =~ /^meta$/i ) ) {
! if ( $value=~ /charset=([^ ]{1,40})[\"\>]?/ ) {
update_word( $self, $1, $encoded, '', '', '' );
}
***************
*** 621,624 ****
--- 623,628 ----
$authinfo = $1 if ( $url =~ s/^(([[:alpha:]0-9\-_\.\;\:\&\=\+\$\,]+)(\@|\%40))+// );
+ $self->update_pseudoword( 'html', 'authorization', $encoded, $oldurl ) if ( defined( $authinfo ) && ( $authinfo ne '' ) );
+
if ( $url =~ s/^(([[:alpha:]0-9\-_]+\.)+)(com|edu|gov|int|mil|net|org|aero|biz|coop|info|museum|name|pro|[[:alpha:]]{2})([^[:alpha:]0-9\-_\.]|$)/$4/i ) {
$host = "$1$3";
***************
*** 1021,1025 ****
# Look for =?foo? syntax that identifies a charset
! if ( $line =~ /=\?(.{1,40})\?/ ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
--- 1025,1029 ----
# Look for =?foo? syntax that identifies a charset
! if ( $line =~ /=\?([^ ]{1,40})\?/ ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
***************
*** 1192,1196 ****
# Check the encoding type in all RFC 2047 encoded headers
! if ( $argument =~ /=\?(.{1,40})\?(Q|B)/i ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
--- 1196,1200 ----
# Check the encoding type in all RFC 2047 encoded headers
! if ( $argument =~ /=\?([^ ]{1,40})\?(Q|B)/i ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
***************
*** 1215,1219 ****
$argument = $self->decode_string( $argument );
! if ( $argument =~ /=\?(.{1,40})\?/ ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
--- 1219,1223 ----
$argument = $self->decode_string( $argument );
! if ( $argument =~ /=\?([^ ]{1,40})\?/ ) {
update_word( $self, $1, 0, '', '', 'charset' );
}
***************
*** 1260,1264 ****
if ( $header =~ /^Content-Type$/i ) {
! if ( $argument =~ /charset=\"?([^\"]{1,40})\"?/ ) {
update_word( $self, $1, 0, '' , '', 'charset' );
}
--- 1264,1268 ----
if ( $header =~ /^Content-Type$/i ) {
! if ( $argument =~ /charset=\"?([^\" ]{1,40})\"?/ ) {
update_word( $self, $1, 0, '' , '', 'charset' );
}
|