|
From: <jgr...@us...> - 2003-07-26 21:17:21
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv13069/Classifier
Modified Files:
MailParse.pm
Log Message:
Added new mail-parsing test files covering some HTML cases not covered by the rest of the test files; fixed various decoding bugs in MailParse along the way; added new handling of tests for stopwords.
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.150
retrieving revision 1.151
diff -C2 -d -r1.150 -r1.151
*** MailParse.pm 26 Jul 2003 18:28:27 -0000 1.150
--- MailParse.pm 26 Jul 2003 21:17:18 -0000 1.151
***************
*** 387,391 ****
# Pull out any email addresses in the line that are marked with <> and have an @ in them
! while ( $line =~ s/(mailto:)?([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+\.[[:alpha:]0-9\-_]+))([\&\)\?\:\/ >\&\;])// ) {
update_word($self, $2, $encoded, ($1?$1:''), '[\&\?\:\/ >\&\;]', $prefix);
add_url($self, $3, $encoded, '\@', '[\&\?\:\/]', $prefix);
--- 387,391 ----
# Pull out any email addresses in the line that are marked with <> and have an @ in them
! while ( $line =~ s/(mailto:)?([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+\.[[:alpha:]0-9\-_]+))([\"\&\)\?\:\/ >\&\;])// ) {
update_word($self, $2, $encoded, ($1?$1:''), '[\&\?\:\/ >\&\;]', $prefix);
add_url($self, $3, $encoded, '\@', '[\&\?\:\/]', $prefix);
***************
*** 556,564 ****
}
}
-
- next;
}
- add_url( $self, $value, $encoded, $quote, $end_quote, '' );
next;
}
--- 556,561 ----
***************
*** 571,575 ****
if ($value =~ /^mailto:/i) {
! if ( $tag =~ /^a$/ && $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/]|$)/i ) {
update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
add_url( $self, $2, $encoded, '@', ($3?'[\\\&\?\:\/]':$end_quote), '' );
--- 568,572 ----
if ($value =~ /^mailto:/i) {
! if ( $tag =~ /^a$/ && $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/\" \t]|$)/i ) {
update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
add_url( $self, $2, $encoded, '@', ($3?'[\\\&\?\:\/]':$end_quote), '' );
***************
*** 602,606 ****
if ( $attribute =~ /^bgsound$/i && $tag =~ /^body$/i ) {
! add_url( $self, $2, $encoded, $quote, $end_quote, '' );
next;
}
--- 599,603 ----
if ( $attribute =~ /^bgsound$/i && $tag =~ /^body$/i ) {
! add_url( $self, $value, $encoded, $quote, $end_quote, '' );
next;
}
***************
*** 677,681 ****
# mailto forms
! if ( $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/])/i ) {
update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
add_url( $self, $2, $encoded, '@', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
--- 674,678 ----
# mailto forms
! if ( $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/\" \t]|$)/i ) {
update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
add_url( $self, $2, $encoded, '@', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
|