|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib/Sprawler Modified Files: Client.pm Master.pm Log Message: - added function from Ilya to check headers for content types - small bug fixes - other little stuff Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** Master.pm 20 Mar 2004 04:26:16 -0000 1.35 --- Master.pm 25 Mar 2004 04:40:36 -0000 1.36 *************** *** 302,306 **** my $db_file=shift; my $fh=shift; ! my %db=(); my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; --- 302,306 ---- my $db_file=shift; my $fh=shift; ! my %db={}; my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; *************** *** 308,312 **** if ($file =~ /(\+\<\&)\=(.*)/g) { $file=$1."=".$2; ! } open($fh, "$file") or die "fdopen $file $!"; --- 308,312 ---- if ($file =~ /(\+\<\&)\=(.*)/g) { $file=$1."=".$2; ! } open($fh, "$file") or die "fdopen $file $!"; Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Client.pm 16 Mar 2004 23:18:31 -0000 1.36 --- Client.pm 25 Mar 2004 04:40:36 -0000 1.37 *************** *** 68,75 **** $self->{DOCPATHS}=[]; $self->{URLPATHS}=[]; ! $self->{INDEX_EXT}=[]; $self->{LANGUAGES}=[]; $self->{CHECKOUTDIR}=[]; $self->{TITLEWORDS}=[]; $self->{BOLDWORDS}=[]; --- 68,76 ---- $self->{DOCPATHS}=[]; $self->{URLPATHS}=[]; ! $self->{INDEX_TYPES}=[]; $self->{LANGUAGES}=[]; $self->{CHECKOUTDIR}=[]; + $self->{CONTENT_TYPE}=undef; $self->{TITLEWORDS}=[]; $self->{BOLDWORDS}=[]; *************** *** 301,307 **** my $document=shift; my @array=undef; my $doctext=LWP::Simple::get($document); - #my $baseurl=undef; - #my $domainbase=undef; chomp $document; --- 302,308 ---- my $document=shift; my @array=undef; + my @docheader=LWP::Simple::head($document); + return if (!(@docheader)); #document unavailable my $doctext=LWP::Simple::get($document); chomp $document; *************** *** 328,332 **** $docsize=length($doctext); ! $parser->set_tag('title'); my $title = $parser -> extract_text ("$doctext"); --- 329,337 ---- $docsize=length($doctext); ! $self->{CONTENT_TYPE} = $parser -> extract_header("@docheader"); ! if (grep {! /^$self->{CONTENT_TYPE}/i} @{$self->{INDEX_TYPES}}) { ! print "Skipping $document - content type: $self->{CONTENT_TYPE}\n"; ! return; ! } $parser->set_tag('title'); my $title = $parser -> extract_text ("$doctext"); |