You can subscribe to this list here.
| 2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
(7) |
Oct
(1) |
Nov
|
Dec
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2004 |
Jan
(41) |
Feb
(40) |
Mar
(55) |
Apr
(1) |
May
|
Jun
(3) |
Jul
|
Aug
(4) |
Sep
|
Oct
|
Nov
|
Dec
|
|
From: Mojo N. <moj...@us...> - 2004-08-18 03:51:36
|
Update of /cvsroot/sprawler/sprawler/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11522/lib Modified Files: Sprawler.pm Log Message: It was simply a globally scoped variable $dir. Locally scoping it fixed it. Mojo Index: Sprawler.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** Sprawler.pm 18 Aug 2004 03:41:37 -0000 1.10 --- Sprawler.pm 18 Aug 2004 03:51:25 -0000 1.11 *************** *** 82,90 **** my $self=shift; my $path=shift; - if (! -e "$path") { foreach my $d (split(/\//,$path)) { ! $dir.=$d."/"; ! #print STDERR "DIR $dir"; if (! -e "$dir") { mkdir("$dir") --- 82,88 ---- my $self=shift; my $path=shift; if (! -e "$path") { foreach my $d (split(/\//,$path)) { ! my $dir.=$d."/"; if (! -e "$dir") { mkdir("$dir") *************** *** 101,112 **** my $timestamp = scalar localtime; chomp($timestamp); - #print STDERR "LOG FILE $logfile\n"; if($self->{LOGFILE}) { $logfile=$self->{LOGFILE}; my $path=$logfile; $path=~s/(.*)\/(.*)$/$1/; - #print STDERR "PATH: $path\n"; - - $self->mkrdir($path); --- 99,106 ---- *************** *** 123,127 **** $self->{LOGFILE}=$1; } ! open(LOG,">> $logfile") or die "ERROR $!\n"; print LOG "$timestamp $string"; --- 117,121 ---- $self->{LOGFILE}=$1; } ! open(LOG,">> $logfile") or die "ERROR $logfile $!\n"; print LOG "$timestamp $string"; |
|
From: Mojo N. <moj...@us...> - 2004-08-18 03:41:59
|
Update of /cvsroot/sprawler/sprawler/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674/lib Modified Files: Sprawler.pm Log Message: Added creating of necessary directories. Unfortunately master.pl fails on the first pass at creating master_indexes and log directories... on second try it works, something about trying to use the log file prior to the path being defined and it happens when the list needs to be seeded. I'll try to fix it shortly. mojo Index: Sprawler.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Sprawler.pm 25 Jun 2004 02:31:05 -0000 1.9 --- Sprawler.pm 18 Aug 2004 03:41:37 -0000 1.10 *************** *** 23,26 **** --- 23,35 ---- $self->{$var}=$value; } + if ($var=~/PATH/) { + if( $self->{$var}=~m/ARRAY/) { + print STDERR "VAR $var <>$self->{$var} \n"; + + } else { + + $self->mkrdir($self->{$var}); + } + } } close(CONFIG); *************** *** 73,80 **** my $self=shift; my $path=shift; if (! -e "$path") { foreach my $d (split(/\//,$path)) { $dir.=$d."/"; ! if (! -e "$dir") { mkdir("$dir") || die "unable to create $dir\n" ; --- 82,91 ---- my $self=shift; my $path=shift; + if (! -e "$path") { foreach my $d (split(/\//,$path)) { $dir.=$d."/"; ! #print STDERR "DIR $dir"; ! if (! -e "$dir") { mkdir("$dir") || die "unable to create $dir\n" ; *************** *** 83,87 **** } } ! sub logf { --- 94,98 ---- } } ! sub logf { *************** *** 90,98 **** --- 101,114 ---- my $timestamp = scalar localtime; chomp($timestamp); + #print STDERR "LOG FILE $logfile\n"; if($self->{LOGFILE}) { $logfile=$self->{LOGFILE}; my $path=$logfile; $path=~s/(.*)\/(.*)$/$1/; + #print STDERR "PATH: $path\n"; + + $self->mkrdir($path); + } else { $logfile="/dev/null"; *************** *** 107,113 **** $self->{LOGFILE}=$1; } ! open(LOG,">> $logfile"); print LOG "$timestamp $string"; ! 
close; } --- 123,131 ---- $self->{LOGFILE}=$1; } ! open(LOG,">> $logfile") or die "ERROR $!\n"; ! print LOG "$timestamp $string"; ! ! close LOG; } |
|
From: Mojo N. <moj...@us...> - 2004-08-18 03:41:59
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674/lib/Sprawler Modified Files: Master.pm Log Message: Added creating of necessary directories. Unfortunately master.pl fails on the first pass at creating master_indexes and log directories... on second try it works, something about trying to use the log file prior to the path being defined and it happens when the list needs to be seeded. I'll try to fix it shortly. mojo Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** Master.pm 25 Jun 2004 02:31:05 -0000 1.40 --- Master.pm 18 Aug 2004 03:41:37 -0000 1.41 *************** *** 59,63 **** $self->{URLPATHS}=[]; $self->{INDEX_EXT}=[]; - bless ($self, $class); return $self; --- 59,62 ---- *************** *** 335,338 **** --- 334,338 ---- my $fh=shift; my %db=(); + print STDERR "DBFILE $db_file\n"; my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; |
|
From: Mojo N. <moj...@us...> - 2004-08-18 03:41:58
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674 Modified Files: db-perftest.pl master.pl Log Message: Added creating of necessary directories. Unfortunately master.pl fails on the first pass at creating master_indexes and log directories... on second try it works, something about trying to use the log file prior to the path being defined and it happens when the list needs to be seeded. I'll try to fix it shortly. mojo Index: db-perftest.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/db-perftest.pl,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** db-perftest.pl 3 Mar 2004 22:43:24 -0000 1.2 --- db-perftest.pl 18 Aug 2004 03:41:37 -0000 1.3 *************** *** 6,15 **** $pathtodbfiles = "/tmp/"; ! $maxrecords = 5000000; $hashdb = $pathtodbfiles . "hash.db"; $btreedb = $pathtodbfiles . "btree.db"; ! tie %testdb, 'DB_File', "$btreedb", O_RDWR|O_CREAT, 0644, $DB_BTREE; print "Starting db build..\n"; &builddb; --- 6,17 ---- $pathtodbfiles = "/tmp/"; ! $maxrecords = 3000000; $hashdb = $pathtodbfiles . "hash.db"; + #$hashdb = undef; $btreedb = $pathtodbfiles . "btree.db"; ! #$btreedb=undef; tie %testdb, 'DB_File', "$btreedb", O_RDWR|O_CREAT, 0644, $DB_BTREE; + #tie %testdb, 'DB_File', "$hashdb", O_RDWR|O_CREAT, 0644, $DB_HASH; print "Starting db build..\n"; &builddb; Index: master.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/master.pl,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** master.pl 25 Jun 2004 02:31:04 -0000 1.23 --- master.pl 18 Aug 2004 03:41:37 -0000 1.24 *************** *** 56,62 **** --- 56,64 ---- $master->load($configfile); + $master->set("DEBUG",1); + ######################################################## # establish port and begin listening on it. # |
|
From: Mojo N. <moj...@us...> - 2004-06-25 02:31:14
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891/lib/Sprawler Modified Files: Client.pm Master.pm Log Message: You know you've been away too long when you forget how to commit to cvs Let's see add funct mkrdir make recursive directory for helping out with config file. added hack to checkin client to clientdb. mojo Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** Master.pm 28 Mar 2004 02:18:09 -0000 1.39 --- Master.pm 25 Jun 2004 02:31:05 -0000 1.40 *************** *** 97,101 **** my $socket=shift; my $urls=shift; ! my $select=$self->{SELECT}; my $bindata=""; --- 97,101 ---- my $socket=shift; my $urls=shift; ! #my $select=$self->{SELECT}; my $bindata=""; *************** *** 257,260 **** --- 257,261 ---- my @urls=(); my $ii=0; + print STDERR " FILE $url_file_statezero\n"; my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO"); my ($obj1, $fdesc1, $dbh1)=$self->open_db($url_file_stateone, "ONE"); *************** *** 263,267 **** if (int(rand(1000)) < 3) { $dbh1->{$key}="$clientid"; ! # print "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; push(@urls,$key); --- 264,268 ---- if (int(rand(1000)) < 3) { $dbh1->{$key}="$clientid"; ! print STDERR "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; push(@urls,$key); *************** *** 368,371 **** --- 369,373 ---- my $request=shift; my $index_path=$self->get("INDEX_PATH"); + $self->mkrdir($index_path); my $clientenabled = undef; my %dbh0=(); *************** *** 373,378 **** my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB"); my $clientid_tag = $clientid . "-" . $request; ! 
my $junk = $dbh0->{$clientid_tag}; my $clientid_return=$dbh0->{$clientid_tag}; $self->close_db($obj0, $fdesc0, "CLIENTDB"); return ($clientid_return); --- 375,386 ---- my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB"); my $clientid_tag = $clientid . "-" . $request; ! #my $junk = $dbh0->{$clientid_tag}; ! if($dbh0->{$clientid_tag}) { ! ! } else { ! $dbh0->{$clientid_tag}=1; ! } my $clientid_return=$dbh0->{$clientid_tag}; + print STDERR "CLIENTID $clientid_return\n"; $self->close_db($obj0, $fdesc0, "CLIENTDB"); return ($clientid_return); Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** Client.pm 1 Apr 2004 03:45:34 -0000 1.41 --- Client.pm 25 Jun 2004 02:31:05 -0000 1.42 *************** *** 564,568 **** $socket->recv($url, 1024); $socket->send("THANKS"); ! print "\rReceiving url $j of $urltotal"; $urls{$url}=1; $j++; --- 564,568 ---- $socket->recv($url, 1024); $socket->send("THANKS"); ! print "\rReceiving url $j of $urltotal URL $url"; $urls{$url}=1; $j++; |
|
From: Mojo N. <moj...@us...> - 2004-06-25 02:31:13
|
Update of /cvsroot/sprawler/sprawler/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891/lib Modified Files: Sprawler.pm Log Message: You know you've been away too long when you forget how to commit to cvs Let's see add funct mkrdir make recursive directory for helping out with config file. added hack to checkin client to clientdb. mojo Index: Sprawler.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** Sprawler.pm 25 Mar 2004 04:40:36 -0000 1.8 --- Sprawler.pm 25 Jun 2004 02:31:05 -0000 1.9 *************** *** 70,73 **** --- 70,88 ---- } + sub mkrdir { + my $self=shift; + my $path=shift; + if (! -e "$path") { + foreach my $d (split(/\//,$path)) { + $dir.=$d."/"; + if (! -e "$dir") { + mkdir("$dir") + || die "unable to create $dir\n" ; + } + } + } + } + + sub logf { my $self=shift; *************** *** 76,80 **** chomp($timestamp); if($self->{LOGFILE}) { ! $logfile=$self->{LOGFILE} } else { $logfile="/dev/null"; --- 91,98 ---- chomp($timestamp); if($self->{LOGFILE}) { ! $logfile=$self->{LOGFILE}; ! my $path=$logfile; ! $path=~s/(.*)\/(.*)$/$1/; ! $self->mkrdir($path); } else { $logfile="/dev/null"; |
|
From: Mojo N. <moj...@us...> - 2004-06-25 02:31:13
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891 Modified Files: master.pl Log Message: You know you've been away too long when you forget how to commit to cvs Let's see add funct mkrdir make recursive directory for helping out with config file. added hack to checkin client to clientdb. mojo Index: master.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/master.pl,v retrieving revision 1.22 retrieving revision 1.23 diff -C2 -d -r1.22 -r1.23 *** master.pl 28 Mar 2004 01:49:40 -0000 1.22 --- master.pl 25 Jun 2004 02:31:04 -0000 1.23 *************** *** 56,60 **** $master->load($configfile); ! $master->set("DEBUG",0); --- 56,60 ---- $master->load($configfile); ! $master->set("DEBUG",1); *************** *** 119,135 **** my $action=$master->receive_request($socket,1024); if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) { ! my $qtyurls=$1; my $clientid=$2; ! my $retval=$master->check_clientid($clientid,"STATUS"); if ($retval eq "1") { if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; } $action=""; my @urls=$master->get_urls($qtyurls,$clientid); $master->logf("($line) child $child_id sending urls...\n"); $master->send_urls($socket,\@urls); $master->logf("ok\n"); } else { print STDERR "Client $clientid attempted to steal from us!\n"; } } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) { --- 119,143 ---- my $action=$master->receive_request($socket,1024); + print STDERR "ACTION $action\n"; if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) { ! my $qtyurls=$1; my $clientid=$2; ! my $retval=undef; ! $retval=$master->check_clientid($clientid,"STATUS"); ! 
#$retval=1; if ($retval eq "1") { if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; } $action=""; my @urls=$master->get_urls($qtyurls,$clientid); + foreach my $url (@urls) { + print STDERR "URL\n"; + } $master->logf("($line) child $child_id sending urls...\n"); + print STDERR "($line) child $child_id sending urls...\n"; $master->send_urls($socket,\@urls); $master->logf("ok\n"); } else { print STDERR "Client $clientid attempted to steal from us!\n"; + exit; } } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) { |
|
From: Mojo N. <moj...@us...> - 2004-04-01 03:57:27
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9493/lib/Sprawler Modified Files: Client.pm Log Message: recursively creates index directory. Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** Client.pm 28 Mar 2004 01:49:40 -0000 1.40 --- Client.pm 1 Apr 2004 03:45:34 -0000 1.41 *************** *** 459,467 **** my $tmp_db=$self->get("URL_DB"); if (! -e "$index_path") { ! mkdir("$index_path") ! || die "unable to create $index_path\n" ; } ! my $urlhash=$self->checksum("$url"); my $db_file=$index_path.$urlhash."\.db"; --- 459,473 ---- my $tmp_db=$self->get("URL_DB"); + if (! -e "$index_path") { ! foreach my $d (split(/\//,$index_path)) { ! $dir.=$d."/"; ! if (! -e "$dir") { ! mkdir("$dir") ! || die "unable to create $dir\n" ; ! } ! } } ! my $urlhash=$self->checksum("$url"); my $db_file=$index_path.$urlhash."\.db"; |
|
From: Eric A. <de...@us...> - 2004-03-28 02:29:23
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3627/lib/Sprawler Modified Files: Master.pm Log Message: Getting tired.. Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** Master.pm 28 Mar 2004 02:15:44 -0000 1.38 --- Master.pm 28 Mar 2004 02:18:09 -0000 1.39 *************** *** 174,181 **** my $url=${$tmp_db}{URL}; if ($dbh1->{$url}) { ! my $stateone_clientid=$dbh1->{$url}; ! } else { ! my $stateone_clientid=""; } if ($stateone_clientid eq $clientid) { --- 174,180 ---- my $url=${$tmp_db}{URL}; + my $stateone_clientid=""; if ($dbh1->{$url}) { ! $stateone_clientid=$dbh1->{$url}; } if ($stateone_clientid eq $clientid) { |
|
From: Eric A. <de...@us...> - 2004-03-28 02:26:57
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3262/lib/Sprawler Modified Files: Master.pm Log Message: - One more minor tweak/bug fix Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** Master.pm 28 Mar 2004 01:49:40 -0000 1.37 --- Master.pm 28 Mar 2004 02:15:44 -0000 1.38 *************** *** 174,178 **** my $url=${$tmp_db}{URL}; ! my $stateone_clientid=$dbh1->{$url}; if ($stateone_clientid eq $clientid) { # client checking in indexed db- change url from stage 1 to stage 2 --- 174,182 ---- my $url=${$tmp_db}{URL}; ! if ($dbh1->{$url}) { ! my $stateone_clientid=$dbh1->{$url}; ! } else { ! my $stateone_clientid=""; ! } if ($stateone_clientid eq $clientid) { # client checking in indexed db- change url from stage 1 to stage 2 |
|
From: Eric A. <de...@us...> - 2004-03-28 02:21:00
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2460 Added Files: reg_harvester.pl Log Message: - Added tool for registering new client, preregistering a default, and checking status of a client. --- NEW FILE: reg_harvester.pl --- #!/usr/bin/perl -w # Copyright (c) 2003, 2004 Sprawler Project # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # Alternatively, this software may be distributed under the terms of the # GNU General Public License ("GPL") version 2 as published by the Free # Software Foundation. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # $Id: reg_harvester.pl,v 1.1 2004/03/28 02:09:41 destari Exp $ use IO::Handle; use lib "./lib"; use Sprawler::Master; use DB_File; $email = shift; $preregister = shift; # open config file and read in settings my $configfile = "master.conf"; my $master=Sprawler::Master->new(); $master->load($configfile); if ($email =~ /\@.+\-\d+/) { if ($preregister) { &preregister_clientid($email,"STATUS"); } my $retval=$master->check_clientid($email,"STATUS"); print "Found -> $retval\n"; } else { &register_clientid($email,"STATUS"); } sub preregister_clientid { my $clientid=shift; my $request=shift; my $index_path=$master->get("INDEX_PATH"); my $clientid_tag = $clientid . "-$request"; my $client_checkoutdb = $index_path."client_checkout.db"; my ($obj0, $fdesc0, $dbh0)=$master->open_db($client_checkoutdb, "CLIENTDBPRE"); $dbh0->{$clientid_tag} = 1; print "Pre-registered clientid: $clientid clientid_tag: $clientid_tag\n"; $master->close_db($obj0, $fdesc0, "CLIENTDBPRE"); } sub register_clientid { my $clientemail=shift; my $request=shift; my $index_path=$master->get("INDEX_PATH"); my $time = time(); srand($time); my $random = int(rand(1000)); my $clientid = "$email-$random$time"; my $clientid_tag = $clientid . "-$request"; my $client_checkoutdb = $index_path."client_checkout.db"; my ($obj1, $fdesc1, $dbh1)=$master->open_db($client_checkoutdb, "CLIENTDB"); $dbh1->{$clientid_tag} = 1; print "Registered $email with clientid: $clientid clientid_tag: $clientid_tag\n"; $master->close_db($obj1, $fdesc1, "CLIENTDB"); } |
|
From: Eric A. <de...@us...> - 2004-03-28 02:00:53
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31736/lib/Sprawler Modified Files: Client.pm Master.pm Log Message: - Added routines to check for client validity. - Clients can now only check in url info for urls they themselves have checked out - Client must "register" first before being able to run. - Minor bug fixes and tweaks - some minor whitespace fixing - Config file changes (subtle) - Uses a default pre-registered clientid, but in the future, users will have to register their own. Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Master.pm 25 Mar 2004 04:40:36 -0000 1.36 --- Master.pm 28 Mar 2004 01:49:40 -0000 1.37 *************** *** 114,117 **** --- 114,118 ---- my $self=shift; my $socket=shift; + my $clientid=shift; my $msg=undef; my $index_path=$self->{INDEX_PATH}; *************** *** 138,154 **** $filerecvsize = length $file; } ! print STDERR "RECEIVED index $filename\n"; ! print STDERR "Filesizes: $filesize ", length($file), "\n"; ! $self->logf("RECEIVED index $filename\n"); $self->logf("Filesizes: $filesize \n"); if($filesize == length($file)) { - $socket->send("0"); - } else { $socket->send("1"); } $db_file=$index_path.$filename; $tmp_file=$index_path.$filename.".mem"; if ( $tmp_file =~ /(.*)/ ) { ! $tmp_file=1; } --- 139,155 ---- $filerecvsize = length $file; } ! print STDERR "RECEIVED $filesize bytes of ", length($file), " in $filename\n"; ! $self->logf("RECEIVED index $filename from $clientid\n"); $self->logf("Filesizes: $filesize \n"); if($filesize == length($file)) { $socket->send("1"); + } else { + $socket->send("0"); } $db_file=$index_path.$filename; $tmp_file=$index_path.$filename.".mem"; + # what is this statement supposed to accomplish? if ( $tmp_file =~ /(.*)/ ) { ! 
$tmp_file=$1; } *************** *** 156,177 **** print INDEX $file; close INDEX; ! $tmp_db = retrieve("$tmp_file"); ! #foreach $key (keys %{$tmp_db}) { ! # print "KEY $key VALUE ${$tmp_db}{$key}"; ! #} ! print STDERR "WRITING index $db_file \n"; $self->logf("WRITING index $db_file\n"); unlink($tmp_file); my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE"); - %{$dbh0}=%{$tmp_db}; - $self->close_db($obj0, $fdesc0, "RECEIVE"); ! print STDERR "extracting new urls\n"; ! $self->logf("extracting new urls\n"); my @url=undef; @url=$self->extract_urls($db_file); ! $self->add_urls(\@url); } --- 157,199 ---- print INDEX $file; close INDEX; ! $tmp_db = retrieve("$tmp_file"); ! ! #print STDERR "WRITING index $db_file \n"; $self->logf("WRITING index $db_file\n"); unlink($tmp_file); + my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE"); ! my $url_file_stateone=$index_path . "master_urls_state1.db"; ! my ($obj1, $fdesc1, $dbh1)=$self->open_db($url_file_stateone, "ONE"); ! ! my $url_file_statetwo=$index_path . "master_urls_state2.db"; ! my ($obj2, $fdesc2, $dbh2)=$self->open_db($url_file_statetwo, "TWO"); ! ! my $url=${$tmp_db}{URL}; ! my $stateone_clientid=$dbh1->{$url}; ! if ($stateone_clientid eq $clientid) { ! # client checking in indexed db- change url from stage 1 to stage 2 ! delete $dbh1->{$url}; ! $dbh2->{$url}=$clientid; ! %{$dbh0}=%{$tmp_db}; ! $self->close_db($obj0, $fdesc0, "RECEIVE"); ! } else { ! # this client did not check out this URL! ! # quietly ignore the data ! $self->close_db($obj0, $fdesc0, "RECEIVE"); ! unlink($db_file); ! print STDERR "Client did not check out this url!: $url\n"; ! } ! $self->close_db($obj1, $fdesc1, "ONE"); ! $self->close_db($obj2, $fdesc2, "TWO"); ! ! print STDERR "Extracting new urls\n"; ! $self->logf("Extracting new urls\n"); my @url=undef; @url=$self->extract_urls($db_file); ! 
$self->add_urls(\@url,$clientid); } *************** *** 217,228 **** --- 239,255 ---- sub get_urls { + no strict 'refs'; + my $self=shift; my $n_urls=shift; + my $clientid=shift; my $r_urls=0; my $indexpath=$self->get("INDEX_PATH"); my $url_file_statezero=$indexpath . "master_urls_state0.db"; my $url_file_stateone=$indexpath . "master_urls_state1.db"; + my $client_checkoutdb=$indexpath . "client_checkout.db"; my %dbh0=(); my %dbh1=(); + my %dbh2=(); my @urls=(); my $ii=0; *************** *** 232,236 **** my ($key,$value) = each %{$dbh0}; if (int(rand(1000)) < 3) { ! $dbh1->{$key}="client id"; # print "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; --- 259,263 ---- my ($key,$value) = each %{$dbh0}; if (int(rand(1000)) < 3) { ! $dbh1->{$key}="$clientid"; # print "state 0 to 1 -> $key\n"; delete $dbh0->{$key}; *************** *** 249,252 **** --- 276,280 ---- my $self=shift; my $urls=shift; + my $clientid=shift; my %url_db_0=(); my %url_db_1=(); *************** *** 265,278 **** foreach my $url (@{$urls}) { ! if(($dbh0->{$url}) || ($dbh1->{$url}) || ($dbh2->{$url}) || ($dbh3->{$url})) { # dont index again } else { # add it to the state 0 (to be indexed) db ! $dbh0->{$url}="client id"; $ii++; } $jj++; } ! $self->logf("added $ii (of $jj) to the master_urls_state0.db\n"); $self->close_db($obj0, $fdesc0, "ZERO"); $self->close_db($obj1, $fdesc1, "ONE"); --- 293,306 ---- foreach my $url (@{$urls}) { ! if((exists $dbh0->{$url}) || (exists $dbh1->{$url}) || (exists $dbh2->{$url}) || (exists $dbh3->{$url})) { # dont index again } else { # add it to the state 0 (to be indexed) db ! $dbh0->{$url}="$clientid"; $ii++; } $jj++; } ! $self->logf("added $ii (of $jj) to the master_urls_state0.db by client $clientid\n"); $self->close_db($obj0, $fdesc0, "ZERO"); $self->close_db($obj1, $fdesc1, "ONE"); *************** *** 302,306 **** my $db_file=shift; my $fh=shift; ! 
my %db={}; my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; --- 330,334 ---- my $db_file=shift; my $fh=shift; ! my %db=(); my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; *************** *** 311,315 **** open($fh, "$file") or die "fdopen $file $!"; ! # open($fh, "+<&=$fd") or die "fdopen $!"; unless (flock ($fh, LOCK_EX | LOCK_NB)) { unless (flock ($fh, LOCK_EX)) { die "flock: $!" } --- 339,343 ---- open($fh, "$file") or die "fdopen $file $!"; ! #open($fh, "+<&=$fd") or die "fdopen $!"; unless (flock ($fh, LOCK_EX | LOCK_NB)) { unless (flock ($fh, LOCK_EX)) { die "flock: $!" } *************** *** 330,333 **** --- 358,379 ---- } + sub check_clientid { + no strict 'refs'; + + my $self=shift; + my $clientid=shift; + my $request=shift; + my $index_path=$self->get("INDEX_PATH"); + my $clientenabled = undef; + my %dbh0=(); + my $client_checkoutdb = $index_path."client_checkout.db"; + my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB"); + my $clientid_tag = $clientid . "-" . $request; + my $junk = $dbh0->{$clientid_tag}; + my $clientid_return=$dbh0->{$clientid_tag}; + $self->close_db($obj0, $fdesc0, "CLIENTDB"); + return ($clientid_return); + } + sub local_index_dir { my $self=shift; Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** Client.pm 25 Mar 2004 05:44:04 -0000 1.39 --- Client.pm 28 Mar 2004 01:49:40 -0000 1.40 *************** *** 91,95 **** $self->{REQUEST_FILE}; $self->{MASTER_REQUEST}; ! $self->{CLIENTID}; $self->{SOCKET}=undef; --- 91,95 ---- $self->{REQUEST_FILE}; $self->{MASTER_REQUEST}; ! $self->{CLIENT_ID}; $self->{SOCKET}=undef; *************** *** 469,483 **** store $tmp_db, "$db_file\.tmp"; rename("$db_file\.tmp", "$db_file"); ! ! 
#tie %url_db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; ! #%url_db=%{$tmp_db}; ! #untie %url_db; ! # %{$tmp_db}=undef; ! # $tmp_db=undef; rename("$index_path/url.db.new","$index_path/url.db"); unlink "$index_path/url.db.new"; - # tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; - return 1; } --- 469,476 ---- store $tmp_db, "$db_file\.tmp"; rename("$db_file\.tmp", "$db_file"); ! rename("$index_path/url.db.new","$index_path/url.db"); unlink "$index_path/url.db.new"; return 1; } *************** *** 502,510 **** my $self=shift; my $filename=shift; ! my $socket=shift;; ! my $index_path=$self->{INDEX_PATH}; my $file=undef; ! $socket->send("SEND_INDEX"); $socket->recv($msg,1024); if ($msg eq "READY") { --- 495,502 ---- my $self=shift; my $filename=shift; ! my $socket=shift; my $index_path=$self->{INDEX_PATH}; my $file=undef; ! $socket->send("SEND_INDEX $self->{CLIENT_ID}"); $socket->recv($msg,1024); if ($msg eq "READY") { *************** *** 549,552 **** --- 541,545 ---- my $self=shift; my $socket=shift; + my $clientid=$self->{CLIENT_ID}; my $urltotal=$self->{URLS_TO_INDEX}; my $index_path=$self->get("INDEX_PATH"); *************** *** 556,560 **** print "Requesting urls\n"; ! $socket->send("REQUEST_URL $urltotal"); #start waiting for $urls; #my $ii=0; --- 549,553 ---- print "Requesting urls\n"; ! $socket->send("REQUEST_URL $urltotal $clientid"); #start waiting for $urls; #my $ii=0; |
|
From: Eric A. <de...@us...> - 2004-03-28 02:00:52
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31736 Modified Files: indexer.conf indexer.pl master.pl Log Message: - Added routines to check for client validity. - Clients can now only check in url info for urls they themselves have checked out - Client must "register" first before being able to run. - Minor bug fixes and tweaks - some minor whitespace fixing - Config file changes (subtle) - Uses a default pre-registered clientid, but in the future, users will have to register their own. Index: indexer.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** indexer.pl 25 Mar 2004 05:44:03 -0000 1.26 --- indexer.pl 28 Mar 2004 01:49:38 -0000 1.27 *************** *** 130,136 **** $totalurls=@urls; $| = 1; ! $k=0; foreach $url (@urls) { ! print "\rIndexing url $k of $totalurls"; $client->indexer($url); $client->flush_db($url); --- 130,136 ---- $totalurls=@urls; $| = 1; ! $k=1; foreach $url (@urls) { ! print "\rIndexing url $k of $totalurls "; $client->indexer($url); $client->flush_db($url); *************** *** 138,161 **** $k++; }; ! print " -> Done.\n"; opendir(INDEX_PATH,"$index_path"); ! @files=grep {! /^\./} readdir(INDEX_PATH); ! #my $socket=$client->connect($hostname); my $totalfiles=@files; $| = 1; ! $j = 0; foreach my $file (@files) { ! print "\rSending index $j of $totalfiles"; if($file =~ /[a-f0-9]{32}\.db/i) { $filetosend=$file; my $socket=$client->connect($hostname,$port); ! $client->send_index($filetosend,$socket); ! unlink("$index_path$filetosend"); ! $socket->close; } $j++; } ! print " -> Done.\n"; $| = 0; --- 138,164 ---- $k++; }; ! print "-> Done.\n"; opendir(INDEX_PATH,"$index_path"); ! #@files=grep {! /^\./} readdir(INDEX_PATH); ! @files=grep {/[a-f0-9]{32}\.db/} readdir(INDEX_PATH); my $totalfiles=@files; $| = 1; ! $j = 1; foreach my $file (@files) { ! 
print "\rSending index $j of $totalfiles "; if($file =~ /[a-f0-9]{32}\.db/i) { $filetosend=$file; my $socket=$client->connect($hostname,$port); ! if ($client->send_index($filetosend,$socket)) { ! unlink("$index_path$filetosend"); ! $socket->close; ! } else { ! print "\nERROR: sending file $filetosend.\n"; ! } } $j++; } ! print "-> Done.\n"; $| = 0; Index: master.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/master.pl,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** master.pl 25 Mar 2004 04:40:35 -0000 1.21 --- master.pl 28 Mar 2004 01:49:40 -0000 1.22 *************** *** 66,70 **** ######################################################## ! $master->seed_urls();$master->logf("\n\n\nseeding urls...\n"); --- 66,71 ---- ######################################################## ! $master->seed_urls(); ! $master->logf("\n\n\nseeding urls...\n"); *************** *** 117,140 **** close PARENT; - #(my $parent_id, my $cc)=split(/\|/,$tmp); - #print "TMP $parent_id <> $cc\n"; my $action=$master->receive_request($socket,1024); ! if ($action =~ /^REQUEST_URL\s+(\d+)/) { ! my $qtyurls=$1; ! if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; } ! $action=""; ! my @urls=$master->get_urls($qtyurls); ! $master->logf("($line) child $child_id sending urls...\n"); ! $master->send_urls($socket,\@urls); ! $master->logf("ok\n"); ! # now we should mark the urls as "out for indexing", and save into hash ! # ! } elsif ($action eq "SEND_INDEX") { ! $action=""; ! $master->logf("($line) child $child_id recieving indexes...\n"); ! $master->receive_index($socket); ! $master->logf("ok\n"); } else { ! $master->logf("child recieved request $action\n"); } --- 118,149 ---- close PARENT; my $action=$master->receive_request($socket,1024); ! if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) { ! my $qtyurls=$1; ! my $clientid=$2; ! my $retval=$master->check_clientid($clientid,"STATUS"); ! 
if ($retval eq "1") { ! if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; } ! $action=""; ! my @urls=$master->get_urls($qtyurls,$clientid); ! $master->logf("($line) child $child_id sending urls...\n"); ! $master->send_urls($socket,\@urls); ! $master->logf("ok\n"); ! } else { ! print STDERR "Client $clientid attempted to steal from us!\n"; ! } ! } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) { ! my $clientid = $1; ! my $retval=$master->check_clientid($clientid,"STATUS"); ! if ($retval eq "1") { ! $action=""; ! $master->logf("($line) child $child_id receiving indexes...\n"); ! $master->receive_index($socket,$clientid); ! $master->logf("ok\n"); ! } else { ! print STDERR "Client $clientid attempted to trick us!\n"; ! } } else { ! $master->logf("child received request $action\n"); } *************** *** 151,155 **** my $child; while ((my $waitedpid = waitpid(-1,WNOHANG)) > 0) { ! logmsg "reaped $waitedpid" . ($? ? " with exit $?" : ''); } $SIG{CHLD} = \&REAPER; # loathe sysV --- 160,164 ---- my $child; while ((my $waitedpid = waitpid(-1,WNOHANG)) > 0) { ! #logmsg "reaped $waitedpid" . ($? ? " with exit $?" : ''); } $SIG{CHLD} = \&REAPER; # loathe sysV Index: indexer.conf =================================================================== RCS file: /cvsroot/sprawler/sprawler/indexer.conf,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** indexer.conf 25 Mar 2004 04:40:35 -0000 1.8 --- indexer.conf 28 Mar 2004 01:49:38 -0000 1.9 *************** *** 9,13 **** URLS_TO_INDEX = 20 ! CLIENT_ID = TESTER1 DEFAULT_SERVER = beta.sprawler.com DEFAULT_SERVER_PORT = 5555 --- 9,13 ---- URLS_TO_INDEX = 20 ! CLIENT_ID = tes...@sp...-1031080407379 DEFAULT_SERVER = beta.sprawler.com DEFAULT_SERVER_PORT = 5555 *************** *** 15,19 **** # interval in minutes reindex_interval = 1440 ! INDEX_TYPES = text/html ! #index_ext = html,txt --- 15,18 ---- # interval in minutes reindex_interval = 1440 ! INDEX_TYPES = text/html text/plain |
|
From: Eric A. <de...@us...> - 2004-03-25 05:55:31
|
Update of /cvsroot/sprawler/sprawler/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943/lib Modified Files: Extract.pm Log Message: - changed indexer output to a cleaner format, now that the indexer is "settling" down a bit. Less verbose, but still tells the story. Index: Extract.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Extract.pm,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** Extract.pm 25 Mar 2004 04:40:36 -0000 1.16 --- Extract.pm 25 Mar 2004 05:44:03 -0000 1.17 *************** *** 231,235 **** $p->parse($text); $n_links=@links; ! print "found $n_links links on $url .....\n"; return (\@links, \@emails); } --- 231,235 ---- $p->parse($text); $n_links=@links; ! #print "found $n_links links on $url ..\n"; return (\@links, \@emails); } |
|
From: Eric A. <de...@us...> - 2004-03-25 05:54:58
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943/lib/Sprawler Modified Files: Client.pm Log Message: - changed indexer output to a cleaner format, now that the indexer is "settling" down a bit. Less verbose, but still tells the story. Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** Client.pm 25 Mar 2004 05:06:45 -0000 1.38 --- Client.pm 25 Mar 2004 05:44:04 -0000 1.39 *************** *** 515,519 **** } $filesize=length $file; ! print "SENDING $filename ($filesize)\n"; $socket->send($filename); --- 515,519 ---- } $filesize=length $file; ! #print "SENDING $filename ($filesize)\n"; $socket->send($filename); *************** *** 529,538 **** $start = 1024 * $counter; $buffer = substr($file, $start, 1024); ! print "#"; $socket->send($buffer); $counter++; $datasent = $counter * 1024; } ! print "\n"; $socket->recv("$msg",1024); if($msg eq "0") { --- 529,538 ---- $start = 1024 * $counter; $buffer = substr($file, $start, 1024); ! #print "#"; $socket->send($buffer); $counter++; $datasent = $counter * 1024; } ! #print "\n"; $socket->recv("$msg",1024); if($msg eq "0") { *************** *** 555,569 **** tie %urls, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; ! 
print "Requesting urls\n"; $socket->send("REQUEST_URL $urltotal"); #start waiting for $urls; #my $ii=0; my $url=undef; + my $j=0; + $|=1; while($url ne "COMPLETE") { $socket->recv($url, 1024); $socket->send("THANKS"); ! print "\rReceiving url $j of $urltotal"; $urls{$url}=1; + $j++; } + print " -> Done.\n"; delete $urls{"COMPLETE"}; delete $urls{""}; |
|
From: Eric A. <de...@us...> - 2004-03-25 05:54:52
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943 Modified Files: indexer.pl Log Message: - changed indexer output to a cleaner format, now that the indexer is "settling" down a bit. Less verbose, but still tells the story. Index: indexer.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v retrieving revision 1.25 retrieving revision 1.26 diff -C2 -d -r1.25 -r1.26 *** indexer.pl 25 Mar 2004 04:40:35 -0000 1.25 --- indexer.pl 25 Mar 2004 05:44:03 -0000 1.26 *************** *** 128,143 **** if (@urls) { #print "indexing urls\n"; foreach $url (@urls) { ! #print "Indexing $url\n"; $client->indexer($url); $client->flush_db($url); $client->remove_url($url); }; ! ! print "Sending indexes\n"; opendir(INDEX_PATH,"$index_path"); @files=grep {! /^\./} readdir(INDEX_PATH); #my $socket=$client->connect($hostname); foreach my $file (@files) { if($file =~ /[a-f0-9]{32}\.db/i) { $filetosend=$file; --- 128,151 ---- if (@urls) { #print "indexing urls\n"; + $totalurls=@urls; + $| = 1; + $k=0; foreach $url (@urls) { ! print "\rIndexing url $k of $totalurls"; $client->indexer($url); $client->flush_db($url); $client->remove_url($url); + $k++; }; ! print " -> Done.\n"; ! opendir(INDEX_PATH,"$index_path"); @files=grep {! /^\./} readdir(INDEX_PATH); #my $socket=$client->connect($hostname); + my $totalfiles=@files; + $| = 1; + $j = 0; foreach my $file (@files) { + print "\rSending index $j of $totalfiles"; if($file =~ /[a-f0-9]{32}\.db/i) { $filetosend=$file; *************** *** 147,151 **** --- 155,162 ---- $socket->close; } + $j++; } + print " -> Done.\n"; + $| = 0; |
|
From: Eric A. <de...@us...> - 2004-03-25 05:17:29
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30378/lib/Sprawler Modified Files: Client.pm Log Message: - added title logging, fixed some items that were missed. Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** Client.pm 25 Mar 2004 04:40:36 -0000 1.37 --- Client.pm 25 Mar 2004 05:06:45 -0000 1.38 *************** *** 74,82 **** $self->{CONTENT_TYPE}=undef; $self->{TITLEWORDS}=[]; $self->{BOLDWORDS}=[]; $self->{ITALICWORDS}=[]; $self->{HTMLWORDS}=[]; $self->{SHORTWORDS}=[]; ! $self->{STOPWORDS}={}; --- 74,89 ---- $self->{CONTENT_TYPE}=undef; $self->{TITLEWORDS}=[]; + $self->{TITLE}=undef; $self->{BOLDWORDS}=[]; $self->{ITALICWORDS}=[]; $self->{HTMLWORDS}=[]; $self->{SHORTWORDS}=[]; ! $self->{URLS}=[]; ! $self->{EMAILS}=[]; ! $self->{PREWORDS}=[]; ! $self->{LIWORDS}=[]; ! $self->{HEADERWORDS}=[]; ! $self->{MARQUEEWORDS}=[]; ! $self->{STOPWORDS}={}; *************** *** 335,343 **** } $parser->set_tag('title'); ! my $title = $parser -> extract_text ("$doctext"); @titlewords=undef; @{$self->{TITLEWORDS}}=undef; ! @titlewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); @{$self->{TITLEWORDS}}=@titlewords; --- 342,351 ---- } $parser->set_tag('title'); ! my $title = $parser->extract_text("$doctext"); @titlewords=undef; @{$self->{TITLEWORDS}}=undef; ! $self->{TITLE}=$title; ! @titlewords=$self->clean($parser->get_words($self->{TITLE})); @{$self->{TITLEWORDS}}=@titlewords; *************** *** 398,401 **** --- 406,410 ---- my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS"); ${$self->{URL_DB}}{URL} = $document; + ${$self->{URL_DB}}{TITLE} = $self->{TITLE}; foreach my $indextype (@indextypes) { ${$self->{URL_DB}}{$indextype} = undef; |
|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib/Sprawler Modified Files: Client.pm Master.pm Log Message: - added function from Ilya to check headers for content types - small bug fixes - other little stuff Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** Master.pm 20 Mar 2004 04:26:16 -0000 1.35 --- Master.pm 25 Mar 2004 04:40:36 -0000 1.36 *************** *** 302,306 **** my $db_file=shift; my $fh=shift; ! my %db=(); my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; --- 302,306 ---- my $db_file=shift; my $fh=shift; ! my %db={}; my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE; my $fd = $db_obj->fd; *************** *** 308,312 **** if ($file =~ /(\+\<\&)\=(.*)/g) { $file=$1."=".$2; ! } open($fh, "$file") or die "fdopen $file $!"; --- 308,312 ---- if ($file =~ /(\+\<\&)\=(.*)/g) { $file=$1."=".$2; ! } open($fh, "$file") or die "fdopen $file $!"; Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** Client.pm 16 Mar 2004 23:18:31 -0000 1.36 --- Client.pm 25 Mar 2004 04:40:36 -0000 1.37 *************** *** 68,75 **** $self->{DOCPATHS}=[]; $self->{URLPATHS}=[]; ! $self->{INDEX_EXT}=[]; $self->{LANGUAGES}=[]; $self->{CHECKOUTDIR}=[]; $self->{TITLEWORDS}=[]; $self->{BOLDWORDS}=[]; --- 68,76 ---- $self->{DOCPATHS}=[]; $self->{URLPATHS}=[]; ! 
$self->{INDEX_TYPES}=[]; $self->{LANGUAGES}=[]; $self->{CHECKOUTDIR}=[]; + $self->{CONTENT_TYPE}=undef; $self->{TITLEWORDS}=[]; $self->{BOLDWORDS}=[]; *************** *** 301,307 **** my $document=shift; my @array=undef; my $doctext=LWP::Simple::get($document); - #my $baseurl=undef; - #my $domainbase=undef; chomp $document; --- 302,308 ---- my $document=shift; my @array=undef; + my @docheader=LWP::Simple::head($document); + return if (!(@docheader)); #document unavailable my $doctext=LWP::Simple::get($document); chomp $document; *************** *** 328,332 **** $docsize=length($doctext); ! $parser->set_tag('title'); my $title = $parser -> extract_text ("$doctext"); --- 329,337 ---- $docsize=length($doctext); ! $self->{CONTENT_TYPE} = $parser -> extract_header("@docheader"); ! if (grep {! /^$self->{CONTENT_TYPE}/i} @{$self->{INDEX_TYPES}}) { ! print "Skipping $document - content type: $self->{CONTENT_TYPE}\n"; ! return; ! } $parser->set_tag('title'); my $title = $parser -> extract_text ("$doctext"); |
|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler/lib In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib Modified Files: Extract.pm Sprawler.pm Log Message: - added function from Ilya to check headers for content types - small bug fixes - other little stuff Index: Extract.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Extract.pm,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** Extract.pm 14 Mar 2004 05:54:25 -0000 1.15 --- Extract.pm 25 Mar 2004 04:40:36 -0000 1.16 *************** *** 116,119 **** --- 116,127 ---- } + sub extract_header ($@) { + my $self = shift; + my @header = shift; + my $ctype = (split /\s+/,$header[0])[0]; + $ctype =~ s/\;//; + return $ctype; + } + sub extract_text ($$) { my $self = shift; *************** *** 148,152 **** # clean up anchors and relative paths,etc, here. # need to deal with ../ ! if ($link =~ /^(.+)\#/o) { # anchor reference $link = $1; --- 156,160 ---- # clean up anchors and relative paths,etc, here. # need to deal with ../ ! if ($link && $link =~ /^(.+)\#/o) { # anchor reference $link = $1; *************** *** 202,221 **** # slip through the cracks? $normalized_url = $baseurl . $link; - print "XXX MISSED $url XXX\n"; } if ($normalized_url) { - # print "--->> $normalized_url\n"; for my $c (split(//, $link)) { ! $o=ord($c); ! if ($o<128){ ! $new_link.=$c; ! } else { ! $new_link=""; ! last; ! } } - - push(@links, $normalized_url); } --- 210,225 ---- # slip through the cracks? $normalized_url = $baseurl . $link; } if ($normalized_url) { for my $c (split(//, $link)) { ! $o=ord($c); ! if ($o<128){ ! $new_link.=$c; ! } else { ! $new_link=""; ! last; ! 
} } push(@links, $normalized_url); } Index: Sprawler.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** Sprawler.pm 15 Mar 2004 05:15:12 -0000 1.7 --- Sprawler.pm 25 Mar 2004 04:40:36 -0000 1.8 *************** *** 85,92 **** $string=sprintf("$format"); } if ($self->{LOGFILE}=~/(.*)/) { $self->{LOGFILE}=$1; } ! open(LOG,">> $self->{LOGFILE}"); print LOG "$timestamp $string"; close; --- 85,93 ---- $string=sprintf("$format"); } + # This looks redundant: if ($self->{LOGFILE}=~/(.*)/) { $self->{LOGFILE}=$1; } ! open(LOG,">> $logfile"); print LOG "$timestamp $string"; close; |
|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/docs Modified Files: to-do.txt Log Message: - added function from Ilya to check headers for content types - small bug fixes - other little stuff Index: to-do.txt =================================================================== RCS file: /cvsroot/sprawler/sprawler/docs/to-do.txt,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** to-do.txt 15 Mar 2004 21:32:24 -0000 1.5 --- to-do.txt 25 Mar 2004 04:40:35 -0000 1.6 *************** *** 8,24 **** strong text, etc, etc) - o client needs to know what content-type we are getting, and decide to - download or not - otherwise, we end up downloading large binary files - and realizing they are not html (I think the web server can tell us if - it's text/html, or whatever) - o fix pick_lanquage method (Eric) o test and select an html parser (HTML:Parser,XML::Parser, ! TokeParser, Pull Parser) based on efficency (open). ! ! o make method in Extractor to parse header info (open) ! o methods for determining font clashes (ask Eric, open) --- 8,17 ---- strong text, etc, etc) o fix pick_lanquage method (Eric) o test and select an html parser (HTML:Parser,XML::Parser, ! TokeParser, Pull Parser) based on efficency (Ilya). ! o methods for determining font clashes (open) *************** *** 62,65 **** --- 55,65 ---- Recently Completed: ----------------- + o make method in Extractor to parse header info (Ilya) + + o client needs to know what content-type we are getting, and decide to + download or not - otherwise, we end up downloading large binary files + and realizing they are not html (I think the web server can tell us if + it's text/html, or whatever) (Ilya) + o added command line operations to indexer (client) to select config file, server name, server port, client id. (Eric) |
|
From: Eric A. <de...@us...> - 2004-03-25 04:51:20
|
Update of /cvsroot/sprawler/sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241 Modified Files: indexer.conf indexer.pl master.pl Log Message: - added function from Ilya to check headers for content types - small bug fixes - other little stuff Index: indexer.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** indexer.pl 14 Mar 2004 05:54:25 -0000 1.24 --- indexer.pl 25 Mar 2004 04:40:35 -0000 1.25 *************** *** 93,97 **** @urlpaths=$client->get("URLPATHS"); $reindex_interval=$client->get("REINDEX_INTERVAL"); ! @index_ext=$client->get("INDEX_EXT"); $contexts=$client->get("CONTEXTS"); $cachesize=$client->get("MAXCACHEDSIZE"); --- 93,97 ---- @urlpaths=$client->get("URLPATHS"); $reindex_interval=$client->get("REINDEX_INTERVAL"); ! @index_ext=$client->get("INDEX_TYPES"); $contexts=$client->get("CONTEXTS"); $cachesize=$client->get("MAXCACHEDSIZE"); *************** *** 99,107 **** ! print "index path: $index_path\n" if $debug; #print "document paths: @docpaths\n" if $debug; #print "url locations: @urlpaths\n" if $debug; #print "reindex interval (mins): $reindex_interval\n" if $debug; ! print "indexable extensions: @index_ext\n" if $debug; #print "known languages: @languages\n" if $debug; --- 99,107 ---- ! print "Index path: $index_path\n" if $debug; #print "document paths: @docpaths\n" if $debug; #print "url locations: @urlpaths\n" if $debug; #print "reindex interval (mins): $reindex_interval\n" if $debug; ! print "Indexable content types: @index_ext\n" if $debug; #print "known languages: @languages\n" if $debug; *************** *** 129,133 **** #print "indexing urls\n"; foreach $url (@urls) { ! #print "indexing $url\n"; $client->indexer($url); $client->flush_db($url); --- 129,133 ---- #print "indexing urls\n"; foreach $url (@urls) { ! 
#print "Indexing $url\n"; $client->indexer($url); $client->flush_db($url); *************** *** 135,139 **** }; ! print "sending indexes\n"; opendir(INDEX_PATH,"$index_path"); @files=grep {! /^\./} readdir(INDEX_PATH); --- 135,139 ---- }; ! print "Sending indexes\n"; opendir(INDEX_PATH,"$index_path"); @files=grep {! /^\./} readdir(INDEX_PATH); Index: master.pl =================================================================== RCS file: /cvsroot/sprawler/sprawler/master.pl,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** master.pl 15 Mar 2004 05:15:12 -0000 1.20 --- master.pl 25 Mar 2004 04:40:35 -0000 1.21 *************** *** 1,3 **** ! #!/usr/bin/perl -wT --- 1,3 ---- ! #!/usr/bin/perl -w Index: indexer.conf =================================================================== RCS file: /cvsroot/sprawler/sprawler/indexer.conf,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** indexer.conf 10 Mar 2004 05:42:26 -0000 1.7 --- indexer.conf 25 Mar 2004 04:40:35 -0000 1.8 *************** *** 15,19 **** # interval in minutes reindex_interval = 1440 ! index_ext = html #index_ext = html,txt --- 15,19 ---- # interval in minutes reindex_interval = 1440 ! INDEX_TYPES = text/html #index_ext = html,txt |
|
From: Eric A. <de...@us...> - 2004-03-20 04:36:10
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21944/lib/Sprawler Modified Files: Master.pm Log Message: minor bugfixes. still hunting for the big one Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** Master.pm 16 Mar 2004 04:19:47 -0000 1.34 --- Master.pm 20 Mar 2004 04:26:16 -0000 1.35 *************** *** 191,197 **** my $seed_file="./url_seed.txt"; my @urls=(); - my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO"); ! if (!(my ($key,$value) = each %{$dbh0})) { print "No urls in db to index! Seeding url index ...\n"; open(SEEDFILE, "$seed_file"); --- 191,198 ---- my $seed_file="./url_seed.txt"; my @urls=(); ! ! if (!(-e $url_file_statezero)) { ! my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO"); print "No urls in db to index! Seeding url index ...\n"; open(SEEDFILE, "$seed_file"); *************** *** 209,216 **** my $urlsadded = @urls; print "$urlsadded urls added to seed list.\n"; } else { print "Using original seed list.\n" } - $self->close_db($obj0, $fdesc0, "ZERO"); } --- 210,217 ---- my $urlsadded = @urls; print "$urlsadded urls added to seed list.\n"; + $self->close_db($obj0, $fdesc0, "ZERO"); } else { print "Using original seed list.\n" } } |
|
From: Eric A. <de...@us...> - 2004-03-16 23:28:22
|
Update of /cvsroot/sprawler/sprawler/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7756/docs Modified Files: how-to-index.txt Log Message: Added more functions to client.pm (indexes more types of data) Index: how-to-index.txt =================================================================== RCS file: /cvsroot/sprawler/sprawler/docs/how-to-index.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** how-to-index.txt 15 Mar 2004 21:32:24 -0000 1.2 --- how-to-index.txt 16 Mar 2004 23:18:30 -0000 1.3 *************** *** 17,70 **** the "offline" index. ! Here's a list of info we need to grab from the page: ! Title <TITLE> ! Headings (text in <H1>, <H2>, and so on) ! Bold words <B> ! Large text <font size=+x> ! Italic words <i> ! Underlined words <ul> ! Linked words <a ..> ! Capitalized words (like This and like THIS) ! Words linked from other pages to the current page ! Word quantity ! word proximity - how close is one word to another ! size of page (in bytes) ! link/non-link text ratio - if half the page is links, how much content can there really be? ! URL of the page - including domain name ! Is the text at the top of the page more important than text at the bottom? ! If text color is same as background color - it's probably search engine fodder <font color=..> ! How many pages link to this page, and which pages ! phone numbers (international ones, too) ! addresses ! email addresses ! domain names ! product numbers/model numbers ! ISBN book numbers (there's and algorithm for this) ! company names ! meta description <meta desc..> ! meta keywords <meta keywords..> ! meta expires <meta expires..> ! filenames ! postal/zip codes ! stock symbols ! abbreviations for province/state names ! em tagged words ! blinking words <blink> ! marquee words <marquee> ! small font words <font size=-x..> ! table headers <???> ! words in table data tags <td>...</td> ! alt tags (for commenting images) <a ... alt=xxxx> ! 
image file names <img src=xxxx > ! quoted words <??> ! block text quoted words <block> ! listed text words <li> ! preformatted text <pre> ! text/image ratio ! individual words, and their frequency ! phrases (Panama canal routine) ! size of entire file ! size of data after html removed ! text/html ratio --- 17,70 ---- the "offline" index. ! Here's a list of info we need to grab from the page: (- is to do, * is done, and ? is unknown state) ! * Title <TITLE> ! * Headings (text in <H1>, <H2>, and so on) ! * Bold words <B> ! - Large text <font size=+x> ! * Italic words <i> ! - Underlined words <ul> ! ? Linked words <a ..> ! - Capitalized words (like This and like THIS) ! - Words linked from other pages to the current page ! - Word quantity ! - word proximity - how close is one word to another ! ? size of page (in bytes) ! - link/non-link text ratio - if half the page is links, how much content can there really be? ! * URL of the page - including domain name ! - Is the text at the top of the page more important than text at the bottom? ! - If text color is same as background color - it's probably search engine fodder <font color=..> ! - How many pages link to this page, and which pages ! - phone numbers (international ones, too) ! - addresses (snail mail) ! * email addresses ! - domain names ! - product numbers/model numbers ! - ISBN book numbers (there's and algorithm for this) ! - company names ! - meta description <meta desc..> ! - meta keywords <meta keywords..> ! - meta expires <meta expires..> ! - filenames ! - postal/zip codes ! - stock symbols ! - abbreviations for province/state names ! - em tagged words <em> ! - blinking words <blink> ! * marquee words <marquee> ! - small font words <font size=-x..> ! - table headers <???> ! - words in table data tags <td>...</td> ! - alt tags (for commenting images) <a ... alt=xxxx> ! - image file names <img src=xxxx > ! - quoted words <??> ! * block text quoted words <block> ! * listed text words <li> ! 
* preformatted text <pre> ! - text/image ratio ! - individual words, and their frequency ! - phrases (Panama canal routine) ! - size of entire file ! - size of data after html removed ! - text/html ratio |
|
From: Eric A. <de...@us...> - 2004-03-16 23:28:06
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7756/lib/Sprawler Modified Files: Client.pm Log Message: Added more functions to client.pm (indexes more types of data) Index: Client.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** Client.pm 10 Mar 2004 05:42:27 -0000 1.35 --- Client.pm 16 Mar 2004 23:18:31 -0000 1.36 *************** *** 337,340 **** --- 337,367 ---- @{$self->{TITLEWORDS}}=@titlewords; + @header=undef; + @{$self->{HEADERWORDS}}=undef; + $parser->set_tag('h1'); #header words + @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); + @{$self->{HEADERWORDS}}=@headerwords; + $parser->set_tag('h2'); + @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); + push(@{$self->{HEADERWORDS}},@headerwords); + + @marqueewords=undef; + @{$self->{MARQUEEWORDS}}=undef; + $parser->set_tag('marquee'); #marquee words + @marqueewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); + @{$self->{MARQUEEWORDS}}=@marqueewords; + + @prewords=undef; + @{$self->{PREWORDS}}=undef; + $parser->set_tag('pre'); #"pre"formatted words + @prewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); + @{$self->{PREWORDS}}=@prewords; + + @liwords=undef; + @{$self->{LIWORDS}}=undef; + $parser->set_tag('li'); #"li"ne words + @liwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext"))); + @{$self->{LIWORDS}}=@liwords; + @boldwords=undef; @{$self->{BOLDWORDS}}=undef; *************** *** 364,368 **** ! my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS","URLS","EMAILS"); ${$self->{URL_DB}}{URL} = $document; foreach my $indextype (@indextypes) { --- 391,395 ---- ! 
my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS"); ${$self->{URL_DB}}{URL} = $document; foreach my $indextype (@indextypes) { |
|
From: Mojo N. <moj...@us...> - 2004-03-16 04:29:01
|
Update of /cvsroot/sprawler/sprawler/lib/Sprawler In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23415/lib/Sprawler Modified Files: Master.pm Log Message: reposting commit. Index: Master.pm =================================================================== RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** Master.pm 15 Mar 2004 05:15:12 -0000 1.33 --- Master.pm 16 Mar 2004 04:19:47 -0000 1.34 *************** *** 149,152 **** --- 149,155 ---- $db_file=$index_path.$filename; $tmp_file=$index_path.$filename.".mem"; + if ( $tmp_file =~ /(.*)/ ) { + $tmp_file=1; + } open(INDEX, "> $tmp_file") or die "NO FILE $!\n"; |