sprawler-commits Mailing List for Sprawler

Status: Pre-Alpha

Brought to you by: destari, mojobnichols

sprawler-commits — Mailing list notifications of CVS commit changes.

You can subscribe to this list here.

2003	_Jan	_Feb	_Mar	_Apr	_May	_Jun	_Jul	_Aug	_Sep (7)	_Oct (1)	_Nov	_Dec
2004	_Jan (41)	_Feb (40)	_Mar (55)	_Apr (1)	_May	_Jun (3)	_Jul	_Aug (4)	_Sep	_Oct	_Nov	_Dec

Flat | Threaded

1 2 3 .. 7 > >> (Page 1 of 7)

[Sprawler-commits] sprawler/lib Sprawler.pm,1.10,1.11

From: Mojo N. <moj...@us...> - 2004-08-18 03:51:36

Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11522/lib

Modified Files:
	Sprawler.pm 
Log Message:
It was simply a globally scoped variable $dir. Locally scoping it fixed it.


Mojo




Index: Sprawler.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** Sprawler.pm	18 Aug 2004 03:41:37 -0000	1.10
--- Sprawler.pm	18 Aug 2004 03:51:25 -0000	1.11
***************
*** 82,90 ****
    my $self=shift;
    my $path=shift;
-   
    if (! -e "$path") {
      foreach my $d (split(/\//,$path)) {
!       $dir.=$d."/";
!       #print STDERR "DIR $dir";
       if (! -e "$dir") {
  	mkdir("$dir") 
--- 82,88 ----
    my $self=shift;
    my $path=shift;
    if (! -e "$path") {
      foreach my $d (split(/\//,$path)) {
!       my $dir.=$d."/";
       if (! -e "$dir") {
  	mkdir("$dir") 
***************
*** 101,112 ****
    my $timestamp = scalar localtime;
    chomp($timestamp);
-   #print STDERR "LOG FILE $logfile\n";
    if($self->{LOGFILE}) {
      $logfile=$self->{LOGFILE};
      my $path=$logfile;
      $path=~s/(.*)\/(.*)$/$1/;
-     #print STDERR "PATH: $path\n";
- 
- 
      $self->mkrdir($path);
  
--- 99,106 ----
***************
*** 123,127 ****
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $logfile") or die "ERROR  $!\n";
    
    print LOG "$timestamp $string";
--- 117,121 ----
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $logfile") or die "ERROR $logfile  $!\n";
    
    print LOG "$timestamp $string";

[Sprawler-commits] sprawler/lib Sprawler.pm,1.9,1.10

From: Mojo N. <moj...@us...> - 2004-08-18 03:41:59

Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674/lib

Modified Files:
	Sprawler.pm 
Log Message:
Added creating of necessary directories.

Unfortunately master.pl fails on the first pass at creating master_indexes and log 
directories... on second try it works, something about trying to use the log file 
prior to the path being defined and it happens when the list needs to be seeded.

I'll try to fix it shortly.

mojo



Index: Sprawler.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** Sprawler.pm	25 Jun 2004 02:31:05 -0000	1.9
--- Sprawler.pm	18 Aug 2004 03:41:37 -0000	1.10
***************
*** 23,26 ****
--- 23,35 ----
        $self->{$var}=$value;
      }
+     if ($var=~/PATH/) {
+       if( $self->{$var}=~m/ARRAY/) {
+ 	print STDERR "VAR $var <>$self->{$var} \n";
+ 	
+       } else {
+ 	
+ 	$self->mkrdir($self->{$var});
+       }
+     }
    }
    close(CONFIG);
***************
*** 73,80 ****
    my $self=shift;
    my $path=shift;
    if (! -e "$path") {
      foreach my $d (split(/\//,$path)) {
        $dir.=$d."/";
!       if (! -e "$dir") {
  	mkdir("$dir") 
  	  || die "unable to create $dir\n" ;
--- 82,91 ----
    my $self=shift;
    my $path=shift;
+   
    if (! -e "$path") {
      foreach my $d (split(/\//,$path)) {
        $dir.=$d."/";
!       #print STDERR "DIR $dir";
!      if (! -e "$dir") {
  	mkdir("$dir") 
  	  || die "unable to create $dir\n" ;
***************
*** 83,87 ****
    }
  }
!   
  
  sub logf {
--- 94,98 ----
    }
  }
! 
  
  sub logf {
***************
*** 90,98 ****
--- 101,114 ----
    my $timestamp = scalar localtime;
    chomp($timestamp);
+   #print STDERR "LOG FILE $logfile\n";
    if($self->{LOGFILE}) {
      $logfile=$self->{LOGFILE};
      my $path=$logfile;
      $path=~s/(.*)\/(.*)$/$1/;
+     #print STDERR "PATH: $path\n";
+ 
+ 
      $self->mkrdir($path);
+ 
    } else {
      $logfile="/dev/null";
***************
*** 107,113 ****
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $logfile");
    print LOG "$timestamp $string";
!   close;
  }
  
--- 123,131 ----
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $logfile") or die "ERROR  $!\n";
!   
    print LOG "$timestamp $string";
!   
!   close LOG;
  }

[Sprawler-commits] sprawler/lib/Sprawler Master.pm,1.40,1.41

From: Mojo N. <moj...@us...> - 2004-08-18 03:41:59

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674/lib/Sprawler

Modified Files:
	Master.pm 
Log Message:
Added creating of necessary directories.

Unfortunately master.pl fails on the first pass at creating master_indexes and log 
directories... on second try it works, something about trying to use the log file 
prior to the path being defined and it happens when the list needs to be seeded.

I'll try to fix it shortly.

mojo



Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** Master.pm	25 Jun 2004 02:31:05 -0000	1.40
--- Master.pm	18 Aug 2004 03:41:37 -0000	1.41
***************
*** 59,63 ****
    $self->{URLPATHS}=[];
    $self->{INDEX_EXT}=[];
- 
    bless ($self, $class);
    return $self;
--- 59,62 ----
***************
*** 335,338 ****
--- 334,338 ----
    my $fh=shift;
    my %db=();
+   print STDERR "DBFILE $db_file\n";
    my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
    my $fd = $db_obj->fd;

[Sprawler-commits] sprawler db-perftest.pl,1.2,1.3 master.pl,1.23,1.24

From: Mojo N. <moj...@us...> - 2004-08-18 03:41:58

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9674

Modified Files:
	db-perftest.pl master.pl 
Log Message:
Added creating of necessary directories.

Unfortunately master.pl fails on the first pass at creating master_indexes and log 
directories... on second try it works, something about trying to use the log file 
prior to the path being defined and it happens when the list needs to be seeded.

I'll try to fix it shortly.

mojo



Index: db-perftest.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/db-perftest.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** db-perftest.pl	3 Mar 2004 22:43:24 -0000	1.2
--- db-perftest.pl	18 Aug 2004 03:41:37 -0000	1.3
***************
*** 6,15 ****
  
  $pathtodbfiles = "/tmp/";
! $maxrecords = 5000000;
  
  $hashdb = $pathtodbfiles . "hash.db";
  $btreedb = $pathtodbfiles . "btree.db";
! 
  tie %testdb,  'DB_File', "$btreedb", O_RDWR|O_CREAT, 0644, $DB_BTREE;
  print "Starting db build..\n";
  &builddb;
--- 6,17 ----
  
  $pathtodbfiles = "/tmp/";
! $maxrecords = 3000000;
  
  $hashdb = $pathtodbfiles . "hash.db";
+ #$hashdb = undef;
  $btreedb = $pathtodbfiles . "btree.db";
! #$btreedb=undef;
  tie %testdb,  'DB_File', "$btreedb", O_RDWR|O_CREAT, 0644, $DB_BTREE;
+ #tie %testdb,  'DB_File', "$hashdb", O_RDWR|O_CREAT, 0644, $DB_HASH;
  print "Starting db build..\n";
  &builddb;

Index: master.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/master.pl,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** master.pl	25 Jun 2004 02:31:04 -0000	1.23
--- master.pl	18 Aug 2004 03:41:37 -0000	1.24
***************
*** 56,62 ****
--- 56,64 ----
  
  $master->load($configfile);
+ 
  $master->set("DEBUG",1);
  
  
+ 
  ########################################################
  #  establish port and begin listening on it.           #

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.41,1.42 Master.pm,1.39,1.40

From: Mojo N. <moj...@us...> - 2004-06-25 02:31:14

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891/lib/Sprawler

Modified Files:
	Client.pm Master.pm 
Log Message:
You know you've been away too long when you forget how to commit to cvs
 
Let's see: added function mkrdir (make recursive directory) to help out with the config file.
Added hack to check in client to clientdb.
 
mojo




Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** Master.pm	28 Mar 2004 02:18:09 -0000	1.39
--- Master.pm	25 Jun 2004 02:31:05 -0000	1.40
***************
*** 97,101 ****
    my $socket=shift;
    my $urls=shift;
!   my $select=$self->{SELECT};
  
    my $bindata="";
--- 97,101 ----
    my $socket=shift;
    my $urls=shift;
!   #my $select=$self->{SELECT};
  
    my $bindata="";
***************
*** 257,260 ****
--- 257,261 ----
    my @urls=();
    my $ii=0;
+   print STDERR " FILE $url_file_statezero\n";
    my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO");
    my ($obj1, $fdesc1, $dbh1)=$self->open_db($url_file_stateone, "ONE");
***************
*** 263,267 ****
      if (int(rand(1000)) < 3) {
        $dbh1->{$key}="$clientid";
!       #      print "state 0 to 1 -> $key\n";
        delete $dbh0->{$key};
        push(@urls,$key);
--- 264,268 ----
      if (int(rand(1000)) < 3) {
        $dbh1->{$key}="$clientid";
!       print STDERR "state 0 to 1 -> $key\n";
        delete $dbh0->{$key};
        push(@urls,$key);
***************
*** 368,371 ****
--- 369,373 ----
    my $request=shift;
    my $index_path=$self->get("INDEX_PATH");
+   $self->mkrdir($index_path);
    my $clientenabled = undef;
    my %dbh0=();
***************
*** 373,378 ****
    my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB");
    my $clientid_tag = $clientid . "-" . $request;
!   my $junk = $dbh0->{$clientid_tag};
    my $clientid_return=$dbh0->{$clientid_tag};
    $self->close_db($obj0, $fdesc0, "CLIENTDB");
    return ($clientid_return);
--- 375,386 ----
    my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB");
    my $clientid_tag = $clientid . "-" . $request;
!   #my $junk = $dbh0->{$clientid_tag};
!   if($dbh0->{$clientid_tag}) {
!     
!   } else {
!     $dbh0->{$clientid_tag}=1;
!   }
    my $clientid_return=$dbh0->{$clientid_tag};
+   print STDERR "CLIENTID $clientid_return\n";
    $self->close_db($obj0, $fdesc0, "CLIENTDB");
    return ($clientid_return);

Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** Client.pm	1 Apr 2004 03:45:34 -0000	1.41
--- Client.pm	25 Jun 2004 02:31:05 -0000	1.42
***************
*** 564,568 ****
      $socket->recv($url, 1024);
      $socket->send("THANKS");
!     print "\rReceiving url $j of $urltotal";
      $urls{$url}=1;
      $j++;
--- 564,568 ----
      $socket->recv($url, 1024);
      $socket->send("THANKS");
!     print "\rReceiving url $j of $urltotal URL $url";
      $urls{$url}=1;
      $j++;

[Sprawler-commits] sprawler/lib Sprawler.pm,1.8,1.9

From: Mojo N. <moj...@us...> - 2004-06-25 02:31:13

Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891/lib

Modified Files:
	Sprawler.pm 
Log Message:
You know you've been away too long when you forget how to commit to cvs
 
Let's see: added function mkrdir (make recursive directory) to help out with the config file.
Added hack to check in client to clientdb.
 
mojo




Index: Sprawler.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** Sprawler.pm	25 Mar 2004 04:40:36 -0000	1.8
--- Sprawler.pm	25 Jun 2004 02:31:05 -0000	1.9
***************
*** 70,73 ****
--- 70,88 ----
  }
  
+ sub mkrdir {
+   my $self=shift;
+   my $path=shift;
+   if (! -e "$path") {
+     foreach my $d (split(/\//,$path)) {
+       $dir.=$d."/";
+       if (! -e "$dir") {
+ 	mkdir("$dir") 
+ 	  || die "unable to create $dir\n" ;
+       }
+     }
+   }
+ }
+   
+ 
  sub logf {
    my $self=shift;
***************
*** 76,80 ****
    chomp($timestamp);
    if($self->{LOGFILE}) {
!     $logfile=$self->{LOGFILE}
    } else {
      $logfile="/dev/null";
--- 91,98 ----
    chomp($timestamp);
    if($self->{LOGFILE}) {
!     $logfile=$self->{LOGFILE};
!     my $path=$logfile;
!     $path=~s/(.*)\/(.*)$/$1/;
!     $self->mkrdir($path);
    } else {
      $logfile="/dev/null";

[Sprawler-commits] sprawler master.pl,1.22,1.23

From: Mojo N. <moj...@us...> - 2004-06-25 02:31:13

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20891

Modified Files:
	master.pl 
Log Message:
You know you've been away too long when you forget how to commit to cvs
 
Let's see: added function mkrdir (make recursive directory) to help out with the config file.
Added hack to check in client to clientdb.
 
mojo




Index: master.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/master.pl,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** master.pl	28 Mar 2004 01:49:40 -0000	1.22
--- master.pl	25 Jun 2004 02:31:04 -0000	1.23
***************
*** 56,60 ****
  
  $master->load($configfile);
! $master->set("DEBUG",0);
  
  
--- 56,60 ----
  
  $master->load($configfile);
! $master->set("DEBUG",1);
  
  
***************
*** 119,135 ****
        
        my $action=$master->receive_request($socket,1024);
        if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) {
! 	      my $qtyurls=$1;
          my $clientid=$2;
!         my $retval=$master->check_clientid($clientid,"STATUS");
          if ($retval eq "1") {
  	        if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; }
  	        $action="";
  	        my @urls=$master->get_urls($qtyurls,$clientid);
  	        $master->logf("($line) child $child_id sending urls...\n"); 
  	        $master->send_urls($socket,\@urls);
  	        $master->logf("ok\n"); 
          } else {
            print STDERR "Client $clientid attempted to steal from us!\n";
          }
        } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) {
--- 119,143 ----
        
        my $action=$master->receive_request($socket,1024);
+       print STDERR "ACTION $action\n";
        if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) {
! 	my $qtyurls=$1;
          my $clientid=$2;
!         my $retval=undef;
! 	$retval=$master->check_clientid($clientid,"STATUS");
! 	#$retval=1;
          if ($retval eq "1") {
  	        if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; }
  	        $action="";
  	        my @urls=$master->get_urls($qtyurls,$clientid);
+ 		foreach my $url (@urls) {
+ 		  print STDERR "URL\n";
+ 		}
  	        $master->logf("($line) child $child_id sending urls...\n"); 
+ 	        print STDERR "($line) child $child_id sending urls...\n";
  	        $master->send_urls($socket,\@urls);
  	        $master->logf("ok\n"); 
          } else {
            print STDERR "Client $clientid attempted to steal from us!\n";
+ 	  exit;
          }
        } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) {

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.40,1.41

From: Mojo N. <moj...@us...> - 2004-04-01 03:57:27

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9493/lib/Sprawler

Modified Files:
	Client.pm 
Log Message:
recursively creates index directory.



Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** Client.pm	28 Mar 2004 01:49:40 -0000	1.40
--- Client.pm	1 Apr 2004 03:45:34 -0000	1.41
***************
*** 459,467 ****
    my $tmp_db=$self->get("URL_DB");
  
    if (! -e "$index_path") {
!     mkdir("$index_path") 
!       || die "unable to create $index_path\n" ;
    }
! 
    my $urlhash=$self->checksum("$url");
    my $db_file=$index_path.$urlhash."\.db";
--- 459,473 ----
    my $tmp_db=$self->get("URL_DB");
  
+ 
    if (! -e "$index_path") {
!     foreach my $d (split(/\//,$index_path)) {
!       $dir.=$d."/";
!       if (! -e "$dir") {
! 	mkdir("$dir") 
! 	  || die "unable to create $dir\n" ;
!       }
!     }
    }
!   
    my $urlhash=$self->checksum("$url");
    my $db_file=$index_path.$urlhash."\.db";

[Sprawler-commits] sprawler/lib/Sprawler Master.pm,1.38,1.39

From: Eric A. <de...@us...> - 2004-03-28 02:29:23

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3627/lib/Sprawler

Modified Files:
	Master.pm 
Log Message:
Getting tired..


Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** Master.pm	28 Mar 2004 02:15:44 -0000	1.38
--- Master.pm	28 Mar 2004 02:18:09 -0000	1.39
***************
*** 174,181 ****
    
    my $url=${$tmp_db}{URL};
    if ($dbh1->{$url}) {
!     my $stateone_clientid=$dbh1->{$url};
!   } else {
!     my $stateone_clientid="";
    }
    if ($stateone_clientid eq $clientid) {
--- 174,180 ----
    
    my $url=${$tmp_db}{URL};
+   my $stateone_clientid="";
    if ($dbh1->{$url}) {
!     $stateone_clientid=$dbh1->{$url};
    }
    if ($stateone_clientid eq $clientid) {

[Sprawler-commits] sprawler/lib/Sprawler Master.pm,1.37,1.38

From: Eric A. <de...@us...> - 2004-03-28 02:26:57

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3262/lib/Sprawler

Modified Files:
	Master.pm 
Log Message:
- One more minor tweak/bug fix


Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** Master.pm	28 Mar 2004 01:49:40 -0000	1.37
--- Master.pm	28 Mar 2004 02:15:44 -0000	1.38
***************
*** 174,178 ****
    
    my $url=${$tmp_db}{URL};
!   my $stateone_clientid=$dbh1->{$url};
    if ($stateone_clientid eq $clientid) {
      # client checking in indexed db- change url from stage 1 to stage 2
--- 174,182 ----
    
    my $url=${$tmp_db}{URL};
!   if ($dbh1->{$url}) {
!     my $stateone_clientid=$dbh1->{$url};
!   } else {
!     my $stateone_clientid="";
!   }
    if ($stateone_clientid eq $clientid) {
      # client checking in indexed db- change url from stage 1 to stage 2

[Sprawler-commits] sprawler reg_harvester.pl,NONE,1.1

From: Eric A. <de...@us...> - 2004-03-28 02:21:00

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2460

Added Files:
	reg_harvester.pl 
Log Message:
- Added tool for registering new client, preregistering a default, and checking status of a client.


--- NEW FILE: reg_harvester.pl ---
#!/usr/bin/perl -w


# Copyright (c) 2003, 2004 Sprawler Project
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# Alternatively, this software may be distributed under the terms of the
# GNU General Public License ("GPL") version 2 as published by the Free
# Software Foundation.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $Id: reg_harvester.pl,v 1.1 2004/03/28 02:09:41 destari Exp $

use IO::Handle;
use lib "./lib";
use Sprawler::Master;
use DB_File;

# Command-line arguments:
#   1. an email address (registers a new client), or an existing client id
#      of the form "<user>@<host>-<digits>" (looks that id up)
#   2. optional flag; when true and argument 1 is a client id, the id is
#      pre-registered before it is looked up
my $email       = shift;
my $preregister = shift;

# Refuse to run without an address: otherwise an undefined value would fall
# through to register_clientid() and a bogus "-<random><time>" id would be
# written to the client database.
if (!defined $email || $email eq '') {
  die "usage: $0 <email | clientid> [preregister]\n";
}

# open config file and read in settings
my $configfile = "master.conf";

my $master=Sprawler::Master->new();

$master->load($configfile);

# Anything containing "@", some text, then "-<digits>" is treated as an
# already-generated client id; everything else is a plain email address that
# still needs an id.
if ($email =~ /\@.+\-\d+/) {
  if ($preregister) {
    preregister_clientid($email,"STATUS");
  }
  my $retval=$master->check_clientid($email,"STATUS");
  print "Found -> $retval\n";
} else {
  register_clientid($email,"STATUS");
}

# Store an existing client id in the client checkout database.
#
# Arguments:
#   $clientid - the full client id to record
#   $request  - request type appended to the id to form the database key
#
# Sets "<clientid>-<request>" to 1 in client_checkout.db (located under the
# configured INDEX_PATH) and prints a confirmation line.
sub preregister_clientid {
  my ($clientid, $request) = @_;

  # The database key is the client id with the request type appended.
  my $clientid_tag = $clientid . "-$request";

  my $checkout_db_path = $master->get("INDEX_PATH") . "client_checkout.db";
  my ($db_obj, $db_fd, $db_hash) = $master->open_db($checkout_db_path, "CLIENTDBPRE");

  $db_hash->{$clientid_tag} = 1;
  print "Pre-registered clientid: $clientid clientid_tag: $clientid_tag\n";

  $master->close_db($db_obj, $db_fd, "CLIENTDBPRE");
}

# Generate a new client id for an email address and store it in the client
# checkout database.
#
# Arguments:
#   $clientemail - email address the new client id is built from
#   $request     - request type appended to the id to form the database key
#
# The id has the form "<email>-<random 0..999><epoch seconds>".  The key
# "<clientid>-<request>" is set to 1 in client_checkout.db (located under the
# configured INDEX_PATH) and the generated id is printed.
sub register_clientid {
  my $clientemail=shift;
  my $request=shift;
  my $index_path=$master->get("INDEX_PATH");
  my $time = time();
  # Deliberately no srand($time) here: seeding with the same timestamp that is
  # embedded in the id made the "random" part fully determined by $time, so
  # two registrations in the same second produced identical ids.  Perl seeds
  # rand() itself on first use.
  my $random = int(rand(1000));
  # Build the id from the argument that was passed in rather than reaching
  # out to the script-level $email variable.
  my $clientid = "$clientemail-$random$time";
  my $clientid_tag = $clientid . "-$request";
  my $client_checkoutdb = $index_path."client_checkout.db";
  my ($obj1, $fdesc1, $dbh1)=$master->open_db($client_checkoutdb, "CLIENTDB");
  $dbh1->{$clientid_tag} = 1;
  print "Registered $clientemail with clientid: $clientid clientid_tag: $clientid_tag\n";
  $master->close_db($obj1, $fdesc1, "CLIENTDB");
}

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.39,1.40 Master.pm,1.36,1.37

From: Eric A. <de...@us...> - 2004-03-28 02:00:53

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31736/lib/Sprawler

Modified Files:
	Client.pm Master.pm 
Log Message:
- Added routines to check for client validity.
- Clients can now only check in url info for urls they themselves have checked out
- Client must "register" first before being able to run.
- Minor bug fixes and tweaks
- some minor whitespace fixing
- Config file changes (subtle)
- Uses a default pre-registered clientid, but in the future, users will have to register their own.


Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** Master.pm	25 Mar 2004 04:40:36 -0000	1.36
--- Master.pm	28 Mar 2004 01:49:40 -0000	1.37
***************
*** 114,117 ****
--- 114,118 ----
    my $self=shift;
    my $socket=shift;
+   my $clientid=shift;
    my $msg=undef;
    my $index_path=$self->{INDEX_PATH};
***************
*** 138,154 ****
      $filerecvsize = length $file;
    }
!   print STDERR "RECEIVED index $filename\n";
!   print STDERR "Filesizes: $filesize ", length($file), "\n";
!   $self->logf("RECEIVED index $filename\n");
    $self->logf("Filesizes: $filesize \n");
    if($filesize == length($file)) {
-     $socket->send("0");
-   } else {
      $socket->send("1");
    }
    $db_file=$index_path.$filename;
    $tmp_file=$index_path.$filename.".mem";
    if ( $tmp_file =~ /(.*)/ ) {
!     $tmp_file=1;
    }
  
--- 139,155 ----
      $filerecvsize = length $file;
    }
!   print STDERR "RECEIVED $filesize bytes of ", length($file), " in $filename\n";
!   $self->logf("RECEIVED index $filename from $clientid\n");
    $self->logf("Filesizes: $filesize \n");
    if($filesize == length($file)) {
      $socket->send("1");
+   } else {
+     $socket->send("0");
    }
    $db_file=$index_path.$filename;
    $tmp_file=$index_path.$filename.".mem";
+   # what is this statement supposed to accomplish?
    if ( $tmp_file =~ /(.*)/ ) {
!     $tmp_file=$1;
    }
  
***************
*** 156,177 ****
    print INDEX $file;
    close INDEX;
!   
    $tmp_db = retrieve("$tmp_file");
!   #foreach $key (keys %{$tmp_db}) {
!    # print "KEY $key VALUE ${$tmp_db}{$key}";
!   #}
!   print STDERR "WRITING index $db_file \n";
    $self->logf("WRITING index $db_file\n");
    
    unlink($tmp_file);
    my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE");
-   %{$dbh0}=%{$tmp_db};
-   $self->close_db($obj0, $fdesc0, "RECEIVE");
    
!   print STDERR "extracting new urls\n";
!   $self->logf("extracting new urls\n");
    my @url=undef;
    @url=$self->extract_urls($db_file);
!   $self->add_urls(\@url);
  
  }
--- 157,199 ----
    print INDEX $file;
    close INDEX;
!     
    $tmp_db = retrieve("$tmp_file");
!   
!   #print STDERR "WRITING index $db_file \n";
    $self->logf("WRITING index $db_file\n");
    
    unlink($tmp_file);
+   
    my ($obj0,$fdesc0,$dbh0)=$self->open_db($db_file, "RECEIVE");
    
!   my $url_file_stateone=$index_path . "master_urls_state1.db";
!   my ($obj1, $fdesc1, $dbh1)=$self->open_db($url_file_stateone, "ONE");
!   
!   my $url_file_statetwo=$index_path . "master_urls_state2.db";
!   my ($obj2, $fdesc2, $dbh2)=$self->open_db($url_file_statetwo, "TWO");
!   
!   my $url=${$tmp_db}{URL};
!   my $stateone_clientid=$dbh1->{$url};
!   if ($stateone_clientid eq $clientid) {
!     # client checking in indexed db- change url from stage 1 to stage 2
!     delete $dbh1->{$url};
!     $dbh2->{$url}=$clientid;
!     %{$dbh0}=%{$tmp_db};
!     $self->close_db($obj0, $fdesc0, "RECEIVE");
!   } else {
!     # this client did not check out this URL!
!     # quietly ignore the data
!     $self->close_db($obj0, $fdesc0, "RECEIVE");
!     unlink($db_file);
!     print STDERR "Client did not check out this url!: $url\n";
!   }
!   $self->close_db($obj1, $fdesc1, "ONE");
!   $self->close_db($obj2, $fdesc2, "TWO");
!   
!   print STDERR "Extracting new urls\n";
!   $self->logf("Extracting new urls\n");
    my @url=undef;
    @url=$self->extract_urls($db_file);
!   $self->add_urls(\@url,$clientid);
  
  }
***************
*** 217,228 ****
--- 239,255 ----
  
  sub get_urls {
+   no strict 'refs';
+ 
    my $self=shift;
    my $n_urls=shift;
+   my $clientid=shift;
    my $r_urls=0;
    my $indexpath=$self->get("INDEX_PATH");
    my $url_file_statezero=$indexpath . "master_urls_state0.db";
    my $url_file_stateone=$indexpath . "master_urls_state1.db";
+   my $client_checkoutdb=$indexpath . "client_checkout.db";
    my %dbh0=();
    my %dbh1=();
+   my %dbh2=();
    my @urls=();
    my $ii=0;
***************
*** 232,236 ****
      my ($key,$value) = each %{$dbh0};
      if (int(rand(1000)) < 3) {
!       $dbh1->{$key}="client id";
        #      print "state 0 to 1 -> $key\n";
        delete $dbh0->{$key};
--- 259,263 ----
      my ($key,$value) = each %{$dbh0};
      if (int(rand(1000)) < 3) {
!       $dbh1->{$key}="$clientid";
        #      print "state 0 to 1 -> $key\n";
        delete $dbh0->{$key};
***************
*** 249,252 ****
--- 276,280 ----
    my $self=shift;
    my $urls=shift;
+   my $clientid=shift;
    my %url_db_0=();
    my %url_db_1=();
***************
*** 265,278 ****
  
    foreach my $url (@{$urls}) {
!     if(($dbh0->{$url}) || ($dbh1->{$url}) || ($dbh2->{$url}) || ($dbh3->{$url})) {
        # dont index again
      } else {
        # add it to the state 0 (to be indexed) db
!       $dbh0->{$url}="client id";
        $ii++;
      }
      $jj++;
    }
!   $self->logf("added  $ii (of $jj) to the master_urls_state0.db\n");
    $self->close_db($obj0, $fdesc0, "ZERO");
    $self->close_db($obj1, $fdesc1, "ONE");
--- 293,306 ----
  
    foreach my $url (@{$urls}) {
!     if((exists $dbh0->{$url}) || (exists $dbh1->{$url}) || (exists $dbh2->{$url}) || (exists $dbh3->{$url})) {
        # dont index again
      } else {
        # add it to the state 0 (to be indexed) db
!       $dbh0->{$url}="$clientid";
        $ii++;
      }
      $jj++;
    }
!   $self->logf("added  $ii (of $jj) to the master_urls_state0.db by client $clientid\n");
    $self->close_db($obj0, $fdesc0, "ZERO");
    $self->close_db($obj1, $fdesc1, "ONE");
***************
*** 302,306 ****
    my $db_file=shift;
    my $fh=shift;
!   my %db={};
    my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
    my $fd = $db_obj->fd;
--- 330,334 ----
    my $db_file=shift;
    my $fh=shift;
!   my %db=();
    my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
    my $fd = $db_obj->fd;
***************
*** 311,315 ****
    
    open($fh, "$file") or die "fdopen $file $!";
! #  open($fh, "+<&=$fd") or die "fdopen $!";
    unless (flock ($fh, LOCK_EX | LOCK_NB)) {
      unless (flock ($fh, LOCK_EX)) { die "flock: $!" }
--- 339,343 ----
    
    open($fh, "$file") or die "fdopen $file $!";
!   #open($fh, "+<&=$fd") or die "fdopen $!";
    unless (flock ($fh, LOCK_EX | LOCK_NB)) {
      unless (flock ($fh, LOCK_EX)) { die "flock: $!" }
***************
*** 330,333 ****
--- 358,379 ----
  }
  
+ sub check_clientid {
+   no strict 'refs';
+ 
+   my $self=shift;
+   my $clientid=shift;
+   my $request=shift;
+   my $index_path=$self->get("INDEX_PATH");
+   my $clientenabled = undef;
+   my %dbh0=();
+   my $client_checkoutdb = $index_path."client_checkout.db";
+   my ($obj0, $fdesc0, $dbh0)=$self->open_db($client_checkoutdb, "CLIENTDB");
+   my $clientid_tag = $clientid . "-" . $request;
+   my $junk = $dbh0->{$clientid_tag};
+   my $clientid_return=$dbh0->{$clientid_tag};
+   $self->close_db($obj0, $fdesc0, "CLIENTDB");
+   return ($clientid_return);
+ }
+ 
  sub local_index_dir {
    my $self=shift;

Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** Client.pm	25 Mar 2004 05:44:04 -0000	1.39
--- Client.pm	28 Mar 2004 01:49:40 -0000	1.40
***************
*** 91,95 ****
    $self->{REQUEST_FILE};
    $self->{MASTER_REQUEST};
!   $self->{CLIENTID};
    
    $self->{SOCKET}=undef;
--- 91,95 ----
    $self->{REQUEST_FILE};
    $self->{MASTER_REQUEST};
!   $self->{CLIENT_ID};
    
    $self->{SOCKET}=undef;
***************
*** 469,483 ****
    store $tmp_db, "$db_file\.tmp";
    rename("$db_file\.tmp", "$db_file");
!   
!   #tie %url_db,  'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
!   #%url_db=%{$tmp_db};
!   #untie %url_db;
!   #	%{$tmp_db}=undef;
!   #	$tmp_db=undef;
    rename("$index_path/url.db.new","$index_path/url.db");
    unlink "$index_path/url.db.new";
  
-   #  tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
-   
    return 1;
  }
--- 469,476 ----
    store $tmp_db, "$db_file\.tmp";
    rename("$db_file\.tmp", "$db_file");
! 
    rename("$index_path/url.db.new","$index_path/url.db");
    unlink "$index_path/url.db.new";
  
    return 1;
  }
***************
*** 502,510 ****
    my $self=shift;
    my $filename=shift;
!   my $socket=shift;;
! 
    my $index_path=$self->{INDEX_PATH};
    my $file=undef;
!   $socket->send("SEND_INDEX");
    $socket->recv($msg,1024);
    if ($msg eq "READY") {
--- 495,502 ----
    my $self=shift;
    my $filename=shift;
!   my $socket=shift;
    my $index_path=$self->{INDEX_PATH};
    my $file=undef;
!   $socket->send("SEND_INDEX $self->{CLIENT_ID}");
    $socket->recv($msg,1024);
    if ($msg eq "READY") {
***************
*** 549,552 ****
--- 541,545 ----
    my $self=shift;
    my $socket=shift;
+   my $clientid=$self->{CLIENT_ID};
    my $urltotal=$self->{URLS_TO_INDEX};
    my $index_path=$self->get("INDEX_PATH");
***************
*** 556,560 ****
  
    print "Requesting urls\n";
!   $socket->send("REQUEST_URL $urltotal");
    #start waiting for $urls;
    #my $ii=0;
--- 549,553 ----
  
    print "Requesting urls\n";
!   $socket->send("REQUEST_URL $urltotal $clientid");
    #start waiting for $urls;
    #my $ii=0;

[Sprawler-commits] sprawler indexer.conf,1.8,1.9 indexer.pl,1.26,1.27 master.pl,1.21,1.22

From: Eric A. <de...@us...> - 2004-03-28 02:00:52

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv31736

Modified Files:
	indexer.conf indexer.pl master.pl 
Log Message:
- Added routines to check for client validity.
- Clients can now check in URL info only for URLs they themselves have checked out
- Client must "register" first before being able to run.
- Minor bug fixes and tweaks
- some minor whitespace fixing
- Config file changes (subtle)
- Uses a default pre-registered clientid, but in the future, users will have to register their own.


Index: indexer.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** indexer.pl	25 Mar 2004 05:44:03 -0000	1.26
--- indexer.pl	28 Mar 2004 01:49:38 -0000	1.27
***************
*** 130,136 ****
      $totalurls=@urls;
      $| = 1;
!     $k=0;
    	foreach $url (@urls) {
!     	print "\rIndexing url $k of $totalurls";
        $client->indexer($url);
        $client->flush_db($url);
--- 130,136 ----
      $totalurls=@urls;
      $| = 1;
!     $k=1;
    	foreach $url (@urls) {
!     	print "\rIndexing url $k of $totalurls ";
        $client->indexer($url);
        $client->flush_db($url);
***************
*** 138,161 ****
        $k++;
    	};
!     print " -> Done.\n";
      
    	opendir(INDEX_PATH,"$index_path");
!   	@files=grep {! /^\./}  readdir(INDEX_PATH);
!   	#my $socket=$client->connect($hostname);
      my $totalfiles=@files;
      $| = 1;
!     $j = 0;
    	foreach my $file (@files) {
!       print "\rSending index $j of $totalfiles";
    	  if($file =~ /[a-f0-9]{32}\.db/i) {
          $filetosend=$file;
          my $socket=$client->connect($hostname,$port);
!         $client->send_index($filetosend,$socket);
!         unlink("$index_path$filetosend");
!         $socket->close;
    	  }
        $j++;
    	}
!     print " -> Done.\n";
      $| = 0;
  
--- 138,164 ----
        $k++;
    	};
!     print "-> Done.\n";
      
    	opendir(INDEX_PATH,"$index_path");
!   	#@files=grep {! /^\./}  readdir(INDEX_PATH);
!     @files=grep {/[a-f0-9]{32}\.db/}  readdir(INDEX_PATH);
      my $totalfiles=@files;
      $| = 1;
!     $j = 1;
    	foreach my $file (@files) {
!       print "\rSending index $j of $totalfiles ";
    	  if($file =~ /[a-f0-9]{32}\.db/i) {
          $filetosend=$file;
          my $socket=$client->connect($hostname,$port);
!         if ($client->send_index($filetosend,$socket)) {
!           unlink("$index_path$filetosend");
!           $socket->close;
!         } else {
!           print "\nERROR: sending file $filetosend.\n";
!         }
    	  }
        $j++;
    	}
!     print "-> Done.\n";
      $| = 0;
  

Index: master.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/master.pl,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** master.pl	25 Mar 2004 04:40:35 -0000	1.21
--- master.pl	28 Mar 2004 01:49:40 -0000	1.22
***************
*** 66,70 ****
  ########################################################
  
! $master->seed_urls();$master->logf("\n\n\nseeding urls...\n");
  
  
--- 66,71 ----
  ########################################################
  
! $master->seed_urls();
! $master->logf("\n\n\nseeding urls...\n");
  
  
***************
*** 117,140 ****
        close PARENT;
        
-       #(my $parent_id, my $cc)=split(/\|/,$tmp);
-       #print "TMP $parent_id <> $cc\n";
        my $action=$master->receive_request($socket,1024);
!       if ($action =~ /^REQUEST_URL\s+(\d+)/) {
! 	my $qtyurls=$1;
! 	if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; }
! 	$action="";
! 	my @urls=$master->get_urls($qtyurls);
! 	$master->logf("($line) child $child_id sending urls...\n"); 
! 	$master->send_urls($socket,\@urls);
! 	$master->logf("ok\n"); 
! 	# now we should mark the urls as "out for indexing", and save into hash
! 	#
!       } elsif ($action eq "SEND_INDEX") {
! 	$action=""; 
! 	$master->logf("($line) child $child_id recieving indexes...\n"); 
! 	$master->receive_index($socket);
! 	$master->logf("ok\n"); 
        } else {
! 	$master->logf("child recieved request $action\n");
        }
        
--- 118,149 ----
        close PARENT;
        
        my $action=$master->receive_request($socket,1024);
!       if ($action =~ /^REQUEST_URL\s+(\d+)\s+(\S+)/) {
! 	      my $qtyurls=$1;
!         my $clientid=$2;
!         my $retval=$master->check_clientid($clientid,"STATUS");
!         if ($retval eq "1") {
! 	        if ($max_req_urls < $qtyurls) { $qtyurls = $max_req_urls; }
! 	        $action="";
! 	        my @urls=$master->get_urls($qtyurls,$clientid);
! 	        $master->logf("($line) child $child_id sending urls...\n"); 
! 	        $master->send_urls($socket,\@urls);
! 	        $master->logf("ok\n"); 
!         } else {
!           print STDERR "Client $clientid attempted to steal from us!\n";
!         }
!       } elsif ($action =~ /^SEND_INDEX\s+(\S+)/) {
!         my $clientid = $1;
!         my $retval=$master->check_clientid($clientid,"STATUS");
!         if ($retval eq "1") {
! 	        $action=""; 
! 	        $master->logf("($line) child $child_id receiving indexes...\n"); 
! 	        $master->receive_index($socket,$clientid);
! 	        $master->logf("ok\n"); 
!         } else {
!           print STDERR "Client $clientid attempted to trick us!\n";
!         }
        } else {
! 	      $master->logf("child received request $action\n");
        }
        
***************
*** 151,155 ****
    my $child;
    while ((my $waitedpid = waitpid(-1,WNOHANG)) > 0) {
!     logmsg "reaped $waitedpid" . ($? ? " with exit $?" : '');
    }
    $SIG{CHLD} = \&REAPER;  # loathe sysV
--- 160,164 ----
    my $child;
    while ((my $waitedpid = waitpid(-1,WNOHANG)) > 0) {
!     #logmsg "reaped $waitedpid" . ($? ? " with exit $?" : '');
    }
    $SIG{CHLD} = \&REAPER;  # loathe sysV

Index: indexer.conf
===================================================================
RCS file: /cvsroot/sprawler/sprawler/indexer.conf,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** indexer.conf	25 Mar 2004 04:40:35 -0000	1.8
--- indexer.conf	28 Mar 2004 01:49:38 -0000	1.9
***************
*** 9,13 ****
  
  URLS_TO_INDEX = 20
! CLIENT_ID = TESTER1
  DEFAULT_SERVER = beta.sprawler.com
  DEFAULT_SERVER_PORT = 5555
--- 9,13 ----
  
  URLS_TO_INDEX = 20
! CLIENT_ID = tes...@sp...-1031080407379
  DEFAULT_SERVER = beta.sprawler.com
  DEFAULT_SERVER_PORT = 5555
***************
*** 15,19 ****
  # interval in minutes
  reindex_interval = 1440 
! INDEX_TYPES = text/html
! #index_ext = html,txt
  
--- 15,18 ----
  # interval in minutes
  reindex_interval = 1440 
! INDEX_TYPES = text/html text/plain

[Sprawler-commits] sprawler/lib Extract.pm,1.16,1.17

From: Eric A. <de...@us...> - 2004-03-25 05:55:31

Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943/lib

Modified Files:
	Extract.pm 
Log Message:
- Changed indexer output to a cleaner format, now that the indexer is "settling down" a bit. Less verbose, but still tells the story.


Index: Extract.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Extract.pm,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** Extract.pm	25 Mar 2004 04:40:36 -0000	1.16
--- Extract.pm	25 Mar 2004 05:44:03 -0000	1.17
***************
*** 231,235 ****
    $p->parse($text);
    $n_links=@links;
!   print "found $n_links links on $url .....\n";
    return (\@links, \@emails);
  }
--- 231,235 ----
    $p->parse($text);
    $n_links=@links;
!   #print "found $n_links links on $url ..\n";
    return (\@links, \@emails);
  }

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.38,1.39

From: Eric A. <de...@us...> - 2004-03-25 05:54:58

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943/lib/Sprawler

Modified Files:
	Client.pm 
Log Message:
- Changed indexer output to a cleaner format, now that the indexer is "settling down" a bit. Less verbose, but still tells the story.


Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** Client.pm	25 Mar 2004 05:06:45 -0000	1.38
--- Client.pm	25 Mar 2004 05:44:04 -0000	1.39
***************
*** 515,519 ****
      }
      $filesize=length $file;
!     print "SENDING $filename ($filesize)\n";
  
      $socket->send($filename);
--- 515,519 ----
      }
      $filesize=length $file;
!     #print "SENDING $filename ($filesize)\n";
  
      $socket->send($filename);
***************
*** 529,538 ****
        $start = 1024 * $counter;
        $buffer = substr($file, $start, 1024);
!       print "#";
        $socket->send($buffer);
        $counter++;
        $datasent = $counter * 1024;
      }
!     print "\n";
      $socket->recv("$msg",1024);
      if($msg eq "0") {
--- 529,538 ----
        $start = 1024 * $counter;
        $buffer = substr($file, $start, 1024);
!       #print "#";
        $socket->send($buffer);
        $counter++;
        $datasent = $counter * 1024;
      }
!     #print "\n";
      $socket->recv("$msg",1024);
      if($msg eq "0") {
***************
*** 555,569 ****
    tie %urls,  'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
  
!   print "requesting urls\n";
    $socket->send("REQUEST_URL $urltotal");
    #start waiting for $urls;
    #my $ii=0;
    my $url=undef;
    while($url ne "COMPLETE") {
      $socket->recv($url, 1024);
      $socket->send("THANKS");
!     print "RECEIVING $url\n";
      $urls{$url}=1;
    }
    delete $urls{"COMPLETE"};
    delete $urls{""};
--- 555,573 ----
    tie %urls,  'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
  
!   print "Requesting urls\n";
    $socket->send("REQUEST_URL $urltotal");
    #start waiting for $urls;
    #my $ii=0;
    my $url=undef;
+   my $j=0;
+   $|=1;
    while($url ne "COMPLETE") {
      $socket->recv($url, 1024);
      $socket->send("THANKS");
!     print "\rReceiving url $j of $urltotal";
      $urls{$url}=1;
+     $j++;
    }
+   print " -> Done.\n";
    delete $urls{"COMPLETE"};
    delete $urls{""};

[Sprawler-commits] sprawler indexer.pl,1.25,1.26

From: Eric A. <de...@us...> - 2004-03-25 05:54:52

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2943

Modified Files:
	indexer.pl 
Log Message:
- Changed indexer output to a cleaner format, now that the indexer is "settling down" a bit. Less verbose, but still tells the story.


Index: indexer.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** indexer.pl	25 Mar 2004 04:40:35 -0000	1.25
--- indexer.pl	25 Mar 2004 05:44:03 -0000	1.26
***************
*** 128,143 ****
  	if (@urls) {
    	#print "indexing urls\n";
    	foreach $url (@urls) {
!     	#print "Indexing $url\n";
        $client->indexer($url);
        $client->flush_db($url);
        $client->remove_url($url);
    	};
! 
!   	print "Sending indexes\n";
    	opendir(INDEX_PATH,"$index_path");
    	@files=grep {! /^\./}  readdir(INDEX_PATH);
    	#my $socket=$client->connect($hostname);
    	foreach my $file (@files) {
    	  if($file =~ /[a-f0-9]{32}\.db/i) {
          $filetosend=$file;
--- 128,151 ----
  	if (@urls) {
    	#print "indexing urls\n";
+     $totalurls=@urls;
+     $| = 1;
+     $k=0;
    	foreach $url (@urls) {
!     	print "\rIndexing url $k of $totalurls";
        $client->indexer($url);
        $client->flush_db($url);
        $client->remove_url($url);
+       $k++;
    	};
!     print " -> Done.\n";
!     
    	opendir(INDEX_PATH,"$index_path");
    	@files=grep {! /^\./}  readdir(INDEX_PATH);
    	#my $socket=$client->connect($hostname);
+     my $totalfiles=@files;
+     $| = 1;
+     $j = 0;
    	foreach my $file (@files) {
+       print "\rSending index $j of $totalfiles";
    	  if($file =~ /[a-f0-9]{32}\.db/i) {
          $filetosend=$file;
***************
*** 147,151 ****
--- 155,162 ----
          $socket->close;
    	  }
+       $j++;
    	}
+     print " -> Done.\n";
+     $| = 0;

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.37,1.38

From: Eric A. <de...@us...> - 2004-03-25 05:17:29

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30378/lib/Sprawler

Modified Files:
	Client.pm 
Log Message:
- added title logging, fixed some items that were missed.


Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** Client.pm	25 Mar 2004 04:40:36 -0000	1.37
--- Client.pm	25 Mar 2004 05:06:45 -0000	1.38
***************
*** 74,82 ****
    $self->{CONTENT_TYPE}=undef;
    $self->{TITLEWORDS}=[];
    $self->{BOLDWORDS}=[];
    $self->{ITALICWORDS}=[];
    $self->{HTMLWORDS}=[];
    $self->{SHORTWORDS}=[];
! 
    $self->{STOPWORDS}={};
    
--- 74,89 ----
    $self->{CONTENT_TYPE}=undef;
    $self->{TITLEWORDS}=[];
+   $self->{TITLE}=undef;
    $self->{BOLDWORDS}=[];
    $self->{ITALICWORDS}=[];
    $self->{HTMLWORDS}=[];
    $self->{SHORTWORDS}=[];
!   $self->{URLS}=[];
!   $self->{EMAILS}=[];
!   $self->{PREWORDS}=[];
!   $self->{LIWORDS}=[];
!   $self->{HEADERWORDS}=[];
!   $self->{MARQUEEWORDS}=[];
!   
    $self->{STOPWORDS}={};
    
***************
*** 335,343 ****
    }
    $parser->set_tag('title');
!   my $title  = $parser -> extract_text ("$doctext");
  
    @titlewords=undef;
    @{$self->{TITLEWORDS}}=undef;
!   @titlewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
    @{$self->{TITLEWORDS}}=@titlewords;
  
--- 342,351 ----
    }
    $parser->set_tag('title');
!   my $title = $parser->extract_text("$doctext");
  
    @titlewords=undef;
    @{$self->{TITLEWORDS}}=undef;
!   $self->{TITLE}=$title;
!   @titlewords=$self->clean($parser->get_words($self->{TITLE}));
    @{$self->{TITLEWORDS}}=@titlewords;
  
***************
*** 398,401 ****
--- 406,410 ----
    my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS");
    ${$self->{URL_DB}}{URL} = $document;
+   ${$self->{URL_DB}}{TITLE} = $self->{TITLE};
    foreach my $indextype (@indextypes) {
      ${$self->{URL_DB}}{$indextype} = undef;

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.36,1.37 Master.pm,1.35,1.36

From: Eric A. <de...@us...> - 2004-03-25 04:51:20

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib/Sprawler

Modified Files:
	Client.pm Master.pm 
Log Message:
- added function from Ilya to check headers for content types
- small bug fixes
- other little stuff


Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** Master.pm	20 Mar 2004 04:26:16 -0000	1.35
--- Master.pm	25 Mar 2004 04:40:36 -0000	1.36
***************
*** 302,306 ****
    my $db_file=shift;
    my $fh=shift;
!   my %db=();
    my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
    my $fd = $db_obj->fd;
--- 302,306 ----
    my $db_file=shift;
    my $fh=shift;
!   my %db={};
    my $db_obj = tie %db, 'DB_File', "$db_file", O_RDWR|O_CREAT, 0644, $DB_BTREE;
    my $fd = $db_obj->fd;
***************
*** 308,312 ****
    if ($file =~ /(\+\<\&)\=(.*)/g) {
      $file=$1."=".$2;
!   }
    
    open($fh, "$file") or die "fdopen $file $!";
--- 308,312 ----
    if ($file =~ /(\+\<\&)\=(.*)/g) {
      $file=$1."=".$2;
!   } 
    
    open($fh, "$file") or die "fdopen $file $!";

Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** Client.pm	16 Mar 2004 23:18:31 -0000	1.36
--- Client.pm	25 Mar 2004 04:40:36 -0000	1.37
***************
*** 68,75 ****
    $self->{DOCPATHS}=[];
    $self->{URLPATHS}=[];
!   $self->{INDEX_EXT}=[];
    $self->{LANGUAGES}=[];
    $self->{CHECKOUTDIR}=[];
  
    $self->{TITLEWORDS}=[];
    $self->{BOLDWORDS}=[];
--- 68,76 ----
    $self->{DOCPATHS}=[];
    $self->{URLPATHS}=[];
!   $self->{INDEX_TYPES}=[];
    $self->{LANGUAGES}=[];
    $self->{CHECKOUTDIR}=[];
  
+   $self->{CONTENT_TYPE}=undef;
    $self->{TITLEWORDS}=[];
    $self->{BOLDWORDS}=[];
***************
*** 301,307 ****
    my $document=shift;
    my @array=undef;
    my $doctext=LWP::Simple::get($document);
-   #my $baseurl=undef;
- 	#my $domainbase=undef;
  	chomp $document;
  	
--- 302,308 ----
    my $document=shift;
    my @array=undef;
+   my @docheader=LWP::Simple::head($document);
+   return if (!(@docheader));	#document unavailable
    my $doctext=LWP::Simple::get($document);
  	chomp $document;
  	
***************
*** 328,332 ****
  
    $docsize=length($doctext);
! 
    $parser->set_tag('title');
    my $title  = $parser -> extract_text ("$doctext");
--- 329,337 ----
  
    $docsize=length($doctext);
!   $self->{CONTENT_TYPE} = $parser -> extract_header("@docheader");
!   if (grep {! /^$self->{CONTENT_TYPE}/i}  @{$self->{INDEX_TYPES}}) {
!     print "Skipping $document - content type: $self->{CONTENT_TYPE}\n";
!     return; 
!   }
    $parser->set_tag('title');
    my $title  = $parser -> extract_text ("$doctext");

[Sprawler-commits] sprawler/lib Extract.pm,1.15,1.16 Sprawler.pm,1.7,1.8

From: Eric A. <de...@us...> - 2004-03-25 04:51:20

Update of /cvsroot/sprawler/sprawler/lib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/lib

Modified Files:
	Extract.pm Sprawler.pm 
Log Message:
- added function from Ilya to check headers for content types
- small bug fixes
- other little stuff


Index: Extract.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Extract.pm,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Extract.pm	14 Mar 2004 05:54:25 -0000	1.15
--- Extract.pm	25 Mar 2004 04:40:36 -0000	1.16
***************
*** 116,119 ****
--- 116,127 ----
  }
  
+ sub extract_header ($@) {
+   my $self = shift;
+   my @header = shift;
+   my $ctype = (split /\s+/,$header[0])[0];
+   $ctype =~ s/\;//;
+   return $ctype;
+ }
+ 
  sub extract_text ($$) {
      my $self = shift;
***************
*** 148,152 ****
      # clean up anchors and relative paths,etc, here.
      # need to deal with ../
!     if ($link =~ /^(.+)\#/o) {
        # anchor reference
        $link = $1;
--- 156,160 ----
      # clean up anchors and relative paths,etc, here.
      # need to deal with ../
!     if ($link && $link =~ /^(.+)\#/o) {
        # anchor reference
        $link = $1;
***************
*** 202,221 ****
        # slip through the cracks?
        $normalized_url = $baseurl . $link;
-       print "XXX MISSED $url XXX\n";
      }
  
      if ($normalized_url) {
-       # print "--->> $normalized_url\n";
        for my $c (split(//, $link)) {
! 	$o=ord($c);
! 	if ($o<128){
! 	  $new_link.=$c;
! 	} else {
! 	  $new_link="";
! 	  last;
! 	}
        }
-       
- 
        push(@links, $normalized_url);
      }
--- 210,225 ----
        # slip through the cracks?
        $normalized_url = $baseurl . $link;
      }
  
      if ($normalized_url) {
        for my $c (split(//, $link)) {
!         $o=ord($c);
! 	      if ($o<128){
! 	        $new_link.=$c;
! 	      } else {
! 	        $new_link="";
! 	        last;
! 	      }
        }
        push(@links, $normalized_url);
      }

Index: Sprawler.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** Sprawler.pm	15 Mar 2004 05:15:12 -0000	1.7
--- Sprawler.pm	25 Mar 2004 04:40:36 -0000	1.8
***************
*** 85,92 ****
      $string=sprintf("$format");
    }
    if ($self->{LOGFILE}=~/(.*)/) {
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $self->{LOGFILE}");
    print LOG "$timestamp $string";
    close;
--- 85,93 ----
      $string=sprintf("$format");
    }
+   # This looks redundant:
    if ($self->{LOGFILE}=~/(.*)/) {
      $self->{LOGFILE}=$1;
    }
!   open(LOG,">> $logfile");
    print LOG "$timestamp $string";
    close;

[Sprawler-commits] sprawler/docs to-do.txt,1.5,1.6

From: Eric A. <de...@us...> - 2004-03-25 04:51:20

Update of /cvsroot/sprawler/sprawler/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241/docs

Modified Files:
	to-do.txt 
Log Message:
- added function from Ilya to check headers for content types
- small bug fixes
- other little stuff


Index: to-do.txt
===================================================================
RCS file: /cvsroot/sprawler/sprawler/docs/to-do.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** to-do.txt	15 Mar 2004 21:32:24 -0000	1.5
--- to-do.txt	25 Mar 2004 04:40:35 -0000	1.6
***************
*** 8,24 ****
       strong text, etc, etc)
  
-    o client needs to know what content-type we are getting, and decide to
-      download or not - otherwise, we end up downloading large binary files
-      and realizing they are not html (I think the web server can tell us if
-      it's text/html, or whatever)
- 
     o fix pick_lanquage method (Eric)
  
     o test and select an html parser (HTML:Parser,XML::Parser,
!      TokeParser, Pull Parser) based on efficency (open).
! 
!    o make method in Extractor to parse header info (open)
  
!    o methods for determining font clashes (ask Eric, open)
  
  
--- 8,17 ----
       strong text, etc, etc)
  
     o fix pick_lanquage method (Eric)
  
     o test and select an html parser (HTML:Parser,XML::Parser,
!      TokeParser, Pull Parser) based on efficency (Ilya).
  
!    o methods for determining font clashes (open)
  
  
***************
*** 62,65 ****
--- 55,65 ----
  Recently Completed: 
  -----------------
+    o make method in Extractor to parse header info (Ilya)
+ 
+    o client needs to know what content-type we are getting, and decide to
+      download or not - otherwise, we end up downloading large binary files
+      and realizing they are not html (I think the web server can tell us if
+      it's text/html, or whatever) (Ilya)
+      
     o added command line operations to indexer (client) to select config file,
       server name, server port, client id. (Eric)

[Sprawler-commits] sprawler indexer.conf,1.7,1.8 indexer.pl,1.24,1.25 master.pl,1.20,1.21

From: Eric A. <de...@us...> - 2004-03-25 04:51:20

Update of /cvsroot/sprawler/sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26241

Modified Files:
	indexer.conf indexer.pl master.pl 
Log Message:
- added function from Ilya to check headers for content types
- small bug fixes
- other little stuff


Index: indexer.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/indexer.pl,v
retrieving revision 1.24
retrieving revision 1.25
diff -C2 -d -r1.24 -r1.25
*** indexer.pl	14 Mar 2004 05:54:25 -0000	1.24
--- indexer.pl	25 Mar 2004 04:40:35 -0000	1.25
***************
*** 93,97 ****
  @urlpaths=$client->get("URLPATHS");
  $reindex_interval=$client->get("REINDEX_INTERVAL");
! @index_ext=$client->get("INDEX_EXT");
  $contexts=$client->get("CONTEXTS");
  $cachesize=$client->get("MAXCACHEDSIZE");
--- 93,97 ----
  @urlpaths=$client->get("URLPATHS");
  $reindex_interval=$client->get("REINDEX_INTERVAL");
! @index_ext=$client->get("INDEX_TYPES");
  $contexts=$client->get("CONTEXTS");
  $cachesize=$client->get("MAXCACHEDSIZE");
***************
*** 99,107 ****
  
  
! print "index path: $index_path\n" if $debug;
  #print "document paths: @docpaths\n" if $debug;
  #print "url locations: @urlpaths\n" if $debug;
  #print "reindex interval (mins): $reindex_interval\n" if $debug;
! print "indexable extensions: @index_ext\n" if $debug;
  #print "known languages: @languages\n" if $debug;
  
--- 99,107 ----
  
  
! print "Index path: $index_path\n" if $debug;
  #print "document paths: @docpaths\n" if $debug;
  #print "url locations: @urlpaths\n" if $debug;
  #print "reindex interval (mins): $reindex_interval\n" if $debug;
! print "Indexable content types: @index_ext\n" if $debug;
  #print "known languages: @languages\n" if $debug;
  
***************
*** 129,133 ****
    	#print "indexing urls\n";
    	foreach $url (@urls) {
!     	#print "indexing $url\n";
        $client->indexer($url);
        $client->flush_db($url);
--- 129,133 ----
    	#print "indexing urls\n";
    	foreach $url (@urls) {
!     	#print "Indexing $url\n";
        $client->indexer($url);
        $client->flush_db($url);
***************
*** 135,139 ****
    	};
  
!   	print "sending indexes\n";
    	opendir(INDEX_PATH,"$index_path");
    	@files=grep {! /^\./}  readdir(INDEX_PATH);
--- 135,139 ----
    	};
  
!   	print "Sending indexes\n";
    	opendir(INDEX_PATH,"$index_path");
    	@files=grep {! /^\./}  readdir(INDEX_PATH);

Index: master.pl
===================================================================
RCS file: /cvsroot/sprawler/sprawler/master.pl,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** master.pl	15 Mar 2004 05:15:12 -0000	1.20
--- master.pl	25 Mar 2004 04:40:35 -0000	1.21
***************
*** 1,3 ****
! #!/usr/bin/perl -wT
  
  
--- 1,3 ----
! #!/usr/bin/perl -w
  
  

Index: indexer.conf
===================================================================
RCS file: /cvsroot/sprawler/sprawler/indexer.conf,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** indexer.conf	10 Mar 2004 05:42:26 -0000	1.7
--- indexer.conf	25 Mar 2004 04:40:35 -0000	1.8
***************
*** 15,19 ****
  # interval in minutes
  reindex_interval = 1440 
! index_ext = html
  #index_ext = html,txt
  
--- 15,19 ----
  # interval in minutes
  reindex_interval = 1440 
! INDEX_TYPES = text/html
  #index_ext = html,txt

[Sprawler-commits] sprawler/lib/Sprawler Master.pm,1.34,1.35

From: Eric A. <de...@us...> - 2004-03-20 04:36:10

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21944/lib/Sprawler

Modified Files:
	Master.pm 
Log Message:
Minor bugfixes. Still hunting for the big one.

Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.34
retrieving revision 1.35
diff -C2 -d -r1.34 -r1.35
*** Master.pm	16 Mar 2004 04:19:47 -0000	1.34
--- Master.pm	20 Mar 2004 04:26:16 -0000	1.35
***************
*** 191,197 ****
    my $seed_file="./url_seed.txt";
    my @urls=();
-   my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO");
    
!   if (!(my ($key,$value) = each %{$dbh0})) {
      print "No urls in db to index! Seeding url index ...\n";
      open(SEEDFILE, "$seed_file");
--- 191,198 ----
    my $seed_file="./url_seed.txt";
    my @urls=();
    
!   
!   if (!(-e $url_file_statezero)) {
!     my ($obj0, $fdesc0, $dbh0)=$self->open_db($url_file_statezero, "ZERO");
      print "No urls in db to index! Seeding url index ...\n";
      open(SEEDFILE, "$seed_file");
***************
*** 209,216 ****
      my $urlsadded = @urls;
      print "$urlsadded urls added to seed list.\n";
    } else {
      print "Using original seed list.\n"
    }
-   $self->close_db($obj0, $fdesc0, "ZERO");
  }
  
--- 210,217 ----
      my $urlsadded = @urls;
      print "$urlsadded urls added to seed list.\n";
+     $self->close_db($obj0, $fdesc0, "ZERO");
    } else {
      print "Using original seed list.\n"
    }
  }

[Sprawler-commits] sprawler/docs how-to-index.txt,1.2,1.3

From: Eric A. <de...@us...> - 2004-03-16 23:28:22

Update of /cvsroot/sprawler/sprawler/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7756/docs

Modified Files:
	how-to-index.txt 
Log Message:
Added more functions to Client.pm (indexes more types of data).


Index: how-to-index.txt
===================================================================
RCS file: /cvsroot/sprawler/sprawler/docs/how-to-index.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** how-to-index.txt	15 Mar 2004 21:32:24 -0000	1.2
--- how-to-index.txt	16 Mar 2004 23:18:30 -0000	1.3
***************
*** 17,70 ****
  the "offline" index.
  
! Here's a list of info we need to grab from the page:
! Title <TITLE>
! Headings (text in <H1>, <H2>, and so on)
! Bold words <B>
! Large text <font size=+x>
! Italic words <i>
! Underlined words <ul>
! Linked words <a ..>
! Capitalized words (like This and like THIS)
! Words linked from other pages to the current page
! Word quantity
! word proximity - how close is one word to another
! size of page (in bytes)
! link/non-link text ratio - if half the page is links, how much content can there really be?
! URL of the page - including domain name
! Is the text at the top of the page more important than text at the bottom?
! If text color is same as background color - it's probably search engine fodder <font color=..>
! How many pages link to this page, and which pages 
! phone numbers (international ones, too)
! addresses
! email addresses
! domain names
! product numbers/model numbers
! ISBN book numbers (there's and algorithm for this)
! company names 
! meta description <meta desc..>
! meta keywords <meta keywords..>
! meta expires <meta expires..>
! filenames
! postal/zip codes 
! stock symbols
! abbreviations for province/state names
! em tagged words
! blinking words <blink>
! marquee words <marquee>
! small font words <font size=-x..>
! table headers <???>
! words in table data tags <td>...</td>
! alt tags (for commenting images) <a ... alt=xxxx>
! image file names <img src=xxxx >
! quoted words <??>
! block text quoted words <block>
! listed text words <li>
! preformatted text <pre>
! text/image ratio 
! individual words, and their frequency
! phrases (Panama canal routine)
! size of entire file
! size of data after html removed
! text/html ratio
  
  
--- 17,70 ----
  the "offline" index.
  
! Here's a list of info we need to grab from the page: (- is to do, * is done, and ? is unknown state)
! * Title <TITLE>
! * Headings (text in <H1>, <H2>, and so on)
! * Bold words <B>
! - Large text <font size=+x>
! * Italic words <i>
! - Underlined words <ul>
! ? Linked words <a ..>
! - Capitalized words (like This and like THIS)
! - Words linked from other pages to the current page
! - Word quantity
! - word proximity - how close is one word to another
! ? size of page (in bytes)
! - link/non-link text ratio - if half the page is links, how much content can there really be?
! * URL of the page - including domain name
! - Is the text at the top of the page more important than text at the bottom?
! - If text color is same as background color - it's probably search engine fodder <font color=..>
! - How many pages link to this page, and which pages 
! - phone numbers (international ones, too)
! - addresses (snail mail)
! * email addresses
! - domain names
! - product numbers/model numbers
! - ISBN book numbers (there's an algorithm for this)
! - company names 
! - meta description <meta desc..>
! - meta keywords <meta keywords..>
! - meta expires <meta expires..>
! - filenames
! - postal/zip codes 
! - stock symbols
! - abbreviations for province/state names
! - em tagged words <em>
! - blinking words <blink>
! * marquee words <marquee>
! - small font words <font size=-x..>
! - table headers <???>
! - words in table data tags <td>...</td>
! - alt tags (for commenting images) <img ... alt=xxxx>
! - image file names <img src=xxxx >
! - quoted words <??>
! * block text quoted words <blockquote>
! * listed text words <li>
! * preformatted text <pre>
! - text/image ratio 
! - individual words, and their frequency
! - phrases (Panama canal routine)
! - size of entire file
! - size of data after html removed
! - text/html ratio

[Sprawler-commits] sprawler/lib/Sprawler Client.pm,1.35,1.36

From: Eric A. <de...@us...> - 2004-03-16 23:28:06

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7756/lib/Sprawler

Modified Files:
	Client.pm 
Log Message:
Added more functions to client.pm (indexes more types of data)


Index: Client.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Client.pm,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** Client.pm	10 Mar 2004 05:42:27 -0000	1.35
--- Client.pm	16 Mar 2004 23:18:31 -0000	1.36
***************
*** 337,340 ****
--- 337,367 ----
    @{$self->{TITLEWORDS}}=@titlewords;
  
+   @header=undef;
+   @{$self->{HEADERWORDS}}=undef;
+   $parser->set_tag('h1'); #header words
+   @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+   @{$self->{HEADERWORDS}}=@headerwords;
+   $parser->set_tag('h2');
+   @headerwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+   push(@{$self->{HEADERWORDS}},@headerwords);
+   
+   @marqueewords=undef;
+   @{$self->{MARQUEEWORDS}}=undef;
+   $parser->set_tag('marquee'); #marquee words
+   @marqueewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+   @{$self->{MARQUEEWORDS}}=@marqueewords;
+   
+   @prewords=undef;
+   @{$self->{PREWORDS}}=undef;
+   $parser->set_tag('pre'); #"pre"formatted words
+   @prewords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+   @{$self->{PREWORDS}}=@prewords;
+  
+   @liwords=undef;
+   @{$self->{LIWORDS}}=undef;
+   $parser->set_tag('li'); #"li"ne words
+   @liwords=$self->clean($parser->get_words($parser -> extract_text ("$doctext")));
+   @{$self->{LIWORDS}}=@liwords;
+ 
    @boldwords=undef;
    @{$self->{BOLDWORDS}}=undef;
***************
*** 364,368 ****
  
  
!   my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS","URLS","EMAILS");
    ${$self->{URL_DB}}{URL} = $document;
    foreach my $indextype (@indextypes) {
--- 391,395 ----
  
  
!   my @indextypes = ("TITLEWORDS", "BOLDWORDS", "ITALICWORDS", "URLS", "EMAILS", "PREWORDS", "LIWORDS", "HEADERWORDS", "MARQUEEWORDS");
    ${$self->{URL_DB}}{URL} = $document;
    foreach my $indextype (@indextypes) {

[Sprawler-commits] sprawler/lib/Sprawler Master.pm,1.33,1.34

From: Mojo N. <moj...@us...> - 2004-03-16 04:29:01

Update of /cvsroot/sprawler/sprawler/lib/Sprawler
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23415/lib/Sprawler

Modified Files:
	Master.pm 
Log Message:
reposting commit.



Index: Master.pm
===================================================================
RCS file: /cvsroot/sprawler/sprawler/lib/Sprawler/Master.pm,v
retrieving revision 1.33
retrieving revision 1.34
diff -C2 -d -r1.33 -r1.34
*** Master.pm	15 Mar 2004 05:15:12 -0000	1.33
--- Master.pm	16 Mar 2004 04:19:47 -0000	1.34
***************
*** 149,152 ****
--- 149,155 ----
    $db_file=$index_path.$filename;
    $tmp_file=$index_path.$filename.".mem";
+   if ( $tmp_file =~ /(.*)/ ) {
+     $tmp_file=1;
+   }
  
    open(INDEX, "> $tmp_file") or die "NO FILE $!\n";

Flat | Threaded

1 2 3 .. 7 > >> (Page 1 of 7)

2003	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep (7)	Oct (1)	Nov	Dec
2004	Jan (41)	Feb (40)	Mar (55)	Apr (1)	May	Jun (3)	Jul	Aug (4)	Sep	Oct	Nov	Dec