From: Eric A. <de...@us...> - 2004-03-28 02:00:53
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31736/lib/Sprawler Modified Files: Client.pm Master.pm Log Message: - Added routines to check for client validity. - Clients can now only check in url info for urls they themselves have checked out - Client must "register" first before being able to run. - Minor bug fixes and tweaks - some minor whitespace fixing - Config file changes (subtle) - Uses a default pre-registered clientid, but in the future, users will have to register their own. Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Master.pm 25 Mar 2004 04:40:36 -0000 1.36 --- Master.pm 28 Mar 2004 01:49:40 -0000 1.37 *************** *** 114,117 **** --- 114,118 ---- my $self=shift; my $socket=shift; + my $clientid=shift; my $msg=undef; my $index_path=$self->{INDEX_PATH}; *************** *** 138,154 **** $filerecvsize = length $file; } ! print STDERR "RECEIVED index $filename\n"; ! print STDERR "Filesizes: $filesize ", length($file), "\n"; ! $self->logf("RECEIVED index $filename\n"); $self->logf("Filesizes: $filesize \n"); if($filesize == length($file)) { - $socket->send("0"); - } else { $socket->send("1"); } $db_file=$index_path.$filename; $tmp_file=$index_path.$filename.".mem"; if ( $tmp_file =~ /(.*)/ ) { ! $tmp_file=1; } --- 139,155 ---- $filerecvsize = length $file; } ! print STDERR "RECEIVED $filesize bytes of ", length($file), " in $filename\n"; ! $self->logf("RECEIVED index $filename from $clientid\n"); $self->logf("Filesizes: $filesize \n"); if($filesize == length($file)) { $socket->send("1"); + } else { + $socket->send("0"); } $db_file=$index_path.$filename; $tmp_file=$index_path.$filename.".mem"; + # what is this statement supposed to accomplish? if ( $tmp_file =~ /(.*)/ ) { ! $tmp_file=$1; } *************** *** 156,177 **** print INDEX $file; close INDEX; ! $tmp_db = retrieve("$tmp_file"); ! #foreach $key (keys %{$tmp_db}) { ! # print "KEY $key VALUE ${$tmp_db}{$key}"; ! #} ! print STDERR "WRITING index $db_file \n"; $self->logf("WRITING index $db_file\n"); unlink($tmp_file); my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE"); - %{$dbh0}=%{$tmp_db}; - $self->close_db($obj0, $fdesc0, "RECEIVE"); ! print STDERR "extracting new urls\n"; ! $self->logf("extracting new urls\n"); my @url=undef; @url=$self->extract_urls($db_file); ! $self->add_urls(\@url); } --- 157,199 ---- print INDEX $file; close INDEX; ! $tmp_db = retrieve("$tmp_file"); ! ! #print STDERR "WRITING index $db_file \n"; $self->logf("WRITING index $db_file\n"); unlink($tmp_file); + my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE"); ! my $url_file_stateone=$index_path . "master_urls_state1.db"; ! my ($obj1, $fdesc1, $dbh1)=$self->open_db($url_file_stateone, "ONE"); ! ! my $url_file_statetwo=$index_path . "master_urls_state2.db"; ! my ($obj2, $fdesc2, $dbh2)=$self->open_db($url_file_statetwo, "TWO"); ! ! my $url=${$tmp_db}{URL}; ! my $stateone_clientid=$dbh1->{$url}; ! if ($stateone_clientid eq $clientid) { ! # client checking in indexed db- change url from stage 1 to stage 2 ! delete $dbh1->{$url}; ! $dbh2->{$url}=$clientid; ! %{$dbh0}=%{$tmp_db}; ! $self->close_db($obj0, $fdesc0, "RECEIVE"); ! } else { ! # this client did not check out this URL! ! # quietly ignore the data ! $self->close_db($obj0, $fdesc0, "RECEIVE"); ! unlink($db_file); ! print STDERR "Client did not check out this url!: $url\n"; ! } ! $self->close_db($obj1, $fdesc1, "ONE"); ! $self->close_db($obj2, $fdesc2, "TWO"); ! ! print STDERR "Extracting new urls\n"; ! $self->logf("Extracting new urls\n"); my @url=undef; @url=$self->extract_urls($db_file); ! $self->add_urls(\@url,$clientid); } *************** *** 217,228 **** --- 239,255 ---- sub get_urls { + no strict 'refs'; + my $self=shift; my $n_urls=shift; + my $clientid=shift; my $r_urls=0; my $indexpath=$self->get("INDEX_PATH"); my $url_file_statezero=$indexpath . "master_urls_state0.db"; my $url_file_stateone=$indexpath . "master_urls_state1.db"; + my $client_checkoutdb=$indexpath . "client_checkout.db"; my %dbh0=(); my %dbh1=(); + my %dbh2=(); my @urls=(); my $ii=0; *************** *** 232,236 **** my ($key,$value) = each %{$dbh0}; if (int(rand(1000)) < 3) { ! $dbh1->{$key}="client id"; # print "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; --- 259,263 ---- my ($key,$value) = each %{$dbh0}; if (int(rand(1000)) < 3) { ! $dbh1->{$key}="$clientid"; # print "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; *************** *** 249,252 **** --- 276,280 ---- my $self=shift; my $urls=shift; + my $clientid=shift; my %url_db_0=(); my %url_db_1=(); *************** *** 265,278 **** foreach my $url (@{$urls}) { ! if(($dbh0->{$url}) || ($dbh1->{$url}) || ($dbh2->{$url}) || ($dbh3->{$url})) { # dont index again } else { # add it to the state 0 (to be indexed) db ! $dbh0->{$url}="client id"; $ii++; } $jj++; } ! $self->logf("added $ii (of $jj) to the master_urls_state0.db\n"); $self->close_db($obj0, $fdesc0, "ZERO"); $self->close_db($obj1, $fdesc1, "ONE"); --- 293,306 ---- foreach my $url (@{$urls}) { ! if((exists $dbh0->{$url}) || (exists $dbh1->{$url}) || (exists $dbh2->{$url}) || (exists $dbh3->{$url})) { # dont index again } else { # add it to the state 0 (to be indexed) db ! $dbh0->{$url}="$clientid"; $ii++; } $jj++; } ! $self->logf("added $ii (of $jj) to the master_urls_state0.db by client $clientid\n"); $self->close_db($obj0, $fdesc0, "ZERO"); $self->close_db($obj1, $fdesc1, "ONE"); *************** *** 302,306 **** my $db_file=shift; my $fh=shift; ! my %db={}; my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; --- 330,334 ---- my $db_file=shift; my $fh=shift; ! my %db=(); my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; *************** *** 311,315 **** open($fh, "$file") or die "fdopen $file $!"; ! # open($fh, "+<&=$fd") or die "fdopen $!"; unless (flock ($fh, LOCK_EX | LOCK_NB)) { unless (flock ($fh, LOCK_EX)) { die "flock: $!" } --- 339,343 ---- open($fh, "$file") or die "fdopen $file $!"; ! #open($fh, "+<&=$fd") or die "fdopen $!"; unless (flock ($fh, LOCK_EX | LOCK_NB)) { unless (flock ($fh, LOCK_EX)) { die "flock: $!" } *************** *** 330,333 **** --- 358,379 ---- } + sub check_clientid { + no strict 'refs'; + + my $self=shift; + my $clientid=shift; + my $request=shift; + my $index_path=$self->get("INDEX_PATH"); + my $clientenabled = undef; + my %dbh0=(); + my $client_checkoutdb = $index_path."client_checkout.db"; + my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB"); + my $clientid_tag = $clientid . "-" . $request; + my $junk = $dbh0->{$clientid_tag}; + my $clientid_return=$dbh0->{$clientid_tag}; + $self->close_db($obj0, $fdesc0, "CLIENTDB"); + return ($clientid_return); + } + sub local_index_dir { my $self=shift; Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** Client.pm 25 Mar 2004 05:44:04 -0000 1.39 --- Client.pm 28 Mar 2004 01:49:40 -0000 1.40 *************** *** 91,95 **** $self->{REQUEST_FILE}; $self->{MASTER_REQUEST}; ! $self->{CLIENTID}; $self->{SOCKET}=undef; --- 91,95 ---- $self->{REQUEST_FILE}; $self->{MASTER_REQUEST}; ! $self->{CLIENT_ID}; $self->{SOCKET}=undef; *************** *** 469,483 **** store $tmp_db, "$db_file\.tmp"; rename("$db_file\.tmp", "$db_file"); ! ! #tie %url_db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; ! #%url_db=%{$tmp_db}; ! #untie %url_db; ! # %{$tmp_db}=undef; ! # $tmp_db=undef; rename("$index_path/url.db.new","$index_path/url.db"); unlink "$index_path/url.db.new"; - # tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; - return 1; } --- 469,476 ---- store $tmp_db, "$db_file\.tmp"; rename("$db_file\.tmp", "$db_file"); ! rename("$index_path/url.db.new","$index_path/url.db"); unlink "$index_path/url.db.new"; return 1; } *************** *** 502,510 **** my $self=shift; my $filename=shift; ! my $socket=shift;; ! my $index_path=$self->{INDEX_PATH}; my $file=undef; ! $socket->send("SEND_INDEX"); $socket->recv($msg,1024); if ($msg eq "READY") { --- 495,502 ---- my $self=shift; my $filename=shift; ! my $socket=shift; my $index_path=$self->{INDEX_PATH}; my $file=undef; ! $socket->send("SEND_INDEX $self->{CLIENT_ID}"); $socket->recv($msg,1024); if ($msg eq "READY") { *************** *** 549,552 **** --- 541,545 ---- my $self=shift; my $socket=shift; + my $clientid=$self->{CLIENT_ID}; my $urltotal=$self->{URLS_TO_INDEX}; my $index_path=$self->get("INDEX_PATH"); *************** *** 556,560 **** print "Requesting urls\n"; ! $socket->send("REQUEST_URL $urltotal"); #start waiting for $urls; #my $ii=0; --- 549,553 ---- print "Requesting urls\n"; ! $socket->send("REQUEST_URL $urltotal $clientid"); #start waiting for $urls; #my $ii=0; |