|
From: <ssc...@us...> - 2003-04-07 20:36:59
|
Update of /cvsroot/popfile/engine/Classifier
In directory sc8-pr-cvs1:/tmp/cvs-serv31327
Modified Files:
MailParse.pm
Log Message:
complete multi-line header parsing to colorize correctly
Index: MailParse.pm
===================================================================
RCS file: /cvsroot/popfile/engine/Classifier/MailParse.pm,v
retrieving revision 1.104
retrieving revision 1.105
diff -C2 -d -r1.104 -r1.105
*** MailParse.pm 7 Apr 2003 17:55:46 -0000 1.104
--- MailParse.pm 7 Apr 2003 20:36:52 -0000 1.105
***************
*** 371,375 ****
$value = $3;
$quote = '';
! $end_quote = '[\> \t\&]';
if (defined $2) {
$quote = $2;
--- 371,375 ----
$value = $3;
$quote = '';
! $end_quote = '[\> \t\&\n]';
if (defined $2) {
$quote = $2;
***************
*** 678,682 ****
$line =~ s/[\t ]+$//;
! print "parse_html: [$line] $self->{in_html_tag__}\n" if $self->{debug};
# Remove HTML comments and other tags that begin !
--- 678,682 ----
$line =~ s/[\t ]+$//;
! print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug};
# Remove HTML comments and other tags that begin !
***************
*** 783,788 ****
$self->{base64__} = '';
!
$self->{in_html_tag__} = 0;
$self->{html_tag__} = '';
$self->{html_arg__} = '';
--- 783,793 ----
$self->{base64__} = '';
!
! # Variable to note that the temporary colorized storage is "frozen",
! # and what type of freeze it is (allows nesting of reasons to freeze
! # colorization)
!
$self->{in_html_tag__} = 0;
+
$self->{html_tag__} = '';
$self->{html_arg__} = '';
***************
*** 823,870 ****
print ">>> $line" if $self->{debug};
!
! if ( $self->{color__} ) {
! my $splitline = $line;
! $splitline =~ s/([^\r\n]{100,120} )/$1\r\n/g;
! $splitline =~ s/([^ \r\n]{120})/$1\r\n/g;
!
! if ( !$self->{in_html_tag__} ) {
! $colorized .= $self->{ut__} if ( $self->{ut__} ne '' );
!
! $self->{ut__} = '';
! }
!
! $splitline =~ s/</&lt;/g;
! $splitline =~ s/>/&gt;/g;
!
! #TODO: regress patch to 0.18.1
! if ( $encoding =~ /quoted\-printable/i ) {
! $splitline =~ s/=3C/&lt;/g;
! $splitline =~ s/=3E/&gt;/g;
! }
!
! $splitline =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
!
! $self->{ut__} .= $splitline;
! }
if ($self->{in_headers__}) {
# Check for blank line signifying end of headers
if ( $line =~ /^(\r\n|\r|\n)/) {
!
! # Parse the last header
!
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
# Clear the saved headers
$header = '';
$argument = '';
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
}
# If we have an email header then just keep the part after the :
--- 828,863 ----
print ">>> $line" if $self->{debug};
!
! $colorized .= $self->{ut__};
! $self->{ut__} = '';
!
! $self->{ut__} .= splitline($line, $encoding);
if ($self->{in_headers__}) {
+
+ # temporary colorization while in headers is handled within parse_header
+
+ $self->{ut__} = '';
# Check for blank line signifying end of headers
if ( $line =~ /^(\r\n|\r|\n)/) {
!
! # Parse the last header
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
# Clear the saved headers
$header = '';
$argument = '';
+
+ $self->{ut__} .= splitline("\015\012", 0);
$self->{in_headers__} = 0;
print "Header parsing complete.\n" if $self->{debug};
+
+ next;
}
+
# If we have an email header then just keep the part after the :
***************
*** 872,877 ****
# Parse the last header
!
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding);
# Save the new information for the current header
--- 865,870 ----
# Parse the last header
!
! ($mime,$encoding) = $self->parse_header($header,$argument,$mime,$encoding) if ($header ne '');
# Save the new information for the current header
***************
*** 879,888 ****
$header = $1;
$argument = $2;
}
# Append to argument if the next line begins with whitespace (isn't a new header)
!
! if ( $line =~ /^[\t ](.*?)(\r\n|\r|\n)/ ) {
! $argument .= $1;
}
next;
--- 872,882 ----
$header = $1;
$argument = $2;
+ next;
}
# Append to argument if the next line begins with whitespace (isn't a new header)
!
! if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) {
! $argument .= "\015\012" . $1;
}
next;
***************
*** 982,985 ****
--- 976,981 ----
$colorized .= clear_out_base64( $self );
close MSG;
+
+ $self->{in_html_tag__} = 0;
if ( $self->{color__} ) {
***************
*** 1099,1105 ****
sub parse_header
{
! my ($self, $header, $argument, $mime, $encoding ) = @_;
print "Header ($header) ($argument)\n" if ($self->{debug});
# After a discussion with Tim Peters and some looking at emails
--- 1095,1107 ----
sub parse_header
{
! my ($self, $header, $argument, $mime, $encoding) = @_;
print "Header ($header) ($argument)\n" if ($self->{debug});
+
+ # Remove over-reading
+ $self->{ut__} = '';
+
+ # Queue just this header for colorization
+ $self->{ut__} = splitline("$header: $argument\015\012", $encoding);
# After a discussion with Tim Peters and some looking at emails
***************
*** 1112,1115 ****
--- 1114,1118 ----
# Handle the From, To and Cc headers and extract email addresses
# from them and treat them as words
+
# For certain headers we are going to mark them specially in the corpus
***************
*** 1221,1225 ****
return ($mime, $encoding);
}
!
# GETTERS/SETTERS
--- 1224,1256 ----
return ($mime, $encoding);
}
!
!
! # ---------------------------------------------------------------------------------------------
! #
! # splitline - Escapes characters so a line will print as plain-text within a HTML document.
! #
! # $line The line to escape
! # $encoding The value of any current encoding scheme
! #
! # ---------------------------------------------------------------------------------------------
!
! sub splitline
! {
! my ($line, $encoding) = @_;
! $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
! $line =~ s/([^ \r\n]{120})/$1\r\n/g;
!
! $line =~ s/</&lt;/g;
! $line =~ s/>/&gt;/g;
!
! if ( $encoding =~ /quoted\-printable/i ) {
! $line =~ s/=3C/&lt;/g;
! $line =~ s/=3E/&gt;/g;
! }
!
! $line =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
!
! return $line;
! }
# GETTERS/SETTERS
***************
*** 1231,1233 ****
--- 1262,1269 ----
return $self->{first20__};
}
+
+
+
1;
+
+
|