|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib

Modified Files:
    Extract.pm Sprawler.pm
Log Message:
- added function from Ilya to check headers for content types
- small bug fixes
- other little stuff

Index: Extract.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Extract.pm,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Extract.pm 14 Mar 2004 05:54:25 -0000 1.15
--- Extract.pm 25 Mar 2004 04:40:36 -0000 1.16
***************
*** 116,119 ****
--- 116,127 ----
  }
+ sub extract_header ($@) {
+     my $self = shift;
+     my @header = shift;
+     my $ctype = (split /\s+/,$header[0])[0];
+     $ctype =~ s/\;//;
+     return $ctype;
+ }
+
  sub extract_text ($$) {
      my $self = shift;
***************
*** 148,152 ****
      # clean up anchors and relative paths,etc, here.
      # need to deal with ../
!     if ($link =~ /^(.+)\#/o) {
          # anchor reference
          $link = $1;
--- 156,160 ----
      # clean up anchors and relative paths,etc, here.
      # need to deal with ../
!     if ($link && $link =~ /^(.+)\#/o) {
          # anchor reference
          $link = $1;
***************
*** 202,221 ****
          # slip through the cracks?
          $normalized_url = $baseurl . $link;
-         print "XXX MISSED $url XXX\n";
      }
      if ($normalized_url) {
-         # print "--->> $normalized_url\n";
          for my $c (split(//, $link)) {
!             $o=ord($c);
!             if ($o<128){
!                 $new_link.=$c;
!             } else {
!                 $new_link="";
!                 last;
!             }
          }
-
-         push(@links, $normalized_url);
      }
--- 210,225 ----
          # slip through the cracks?
          $normalized_url = $baseurl . $link;
      }
      if ($normalized_url) {
          for my $c (split(//, $link)) {
!             $o=ord($c);
!             if ($o<128){
!                 $new_link.=$c;
!             } else {
!                 $new_link="";
!                 last;
!             }
          }
          push(@links, $normalized_url);
      }

Index: Sprawler.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** Sprawler.pm 15 Mar 2004 05:15:12 -0000 1.7
--- Sprawler.pm 25 Mar 2004 04:40:36 -0000 1.8
***************
*** 85,92 ****
          $string=sprintf("$format");
      }
      if ($self->{LOGFILE}=~/(.*)/) {
          $self->{LOGFILE}=$1;
      }
!     open(LOG,">> $self->{LOGFILE}");
      print LOG "$timestamp $string";
      close;
--- 85,93 ----
          $string=sprintf("$format");
      }
+     # This looks redundant:
      if ($self->{LOGFILE}=~/(.*)/) {
          $self->{LOGFILE}=$1;
      }
!     open(LOG,">> $logfile");
      print LOG "$timestamp $string";
      close;
|