|
From: <buc...@us...> - 2010-03-10 22:39:00
|
Revision: 177
http://devmon.svn.sourceforge.net/devmon/?rev=177&view=rev
Author: buchanmilne
Date: 2010-03-10 22:38:54 +0000 (Wed, 10 Mar 2010)
Log Message:
-----------
Make the master "ping" idle forks to check that IPC still works, and kill any
fork that cannot be pinged, fails to answer, stalls mid-reply, or answers without a pong
Modified Paths:
--------------
trunk/modules/dm_snmp.pm
Modified: trunk/modules/dm_snmp.pm
===================================================================
--- trunk/modules/dm_snmp.pm 2010-03-10 22:28:03 UTC (rev 176)
+++ trunk/modules/dm_snmp.pm 2010-03-10 22:38:54 UTC (rev 177)
@@ -320,6 +320,90 @@
++$active_forks;
$g{'forks'}{$fork}{'time'} = time;
}
+
+ # If our fork is idle and has been for more than the cycle time
+ # make sure it is still alive
+ if(!defined $g{'forks'}{$fork}{'dev'}) {
+ my $idletime = time - $g{'forks'}{$fork}{'time'};
+ next if ($idletime <= $g{'cycletime'});
+ if (defined $g{'forks'}{$fork}{'pinging'}) {
+ do_log("DEBUG SNMP: Fork $fork was pinged, checking for reply",4) if $g{'debug'};
+ my $select = IO::Select->new($g{'forks'}{$fork}{'CS'});
+ if($select->can_read(0.01)) {
+
+ do_log("DEBUG SNMP: Fork $fork has data, reading it",4) if $g{'debug'};
+ # Okay, we know we have something in the buffer, keep reading
+ # till we get an EOF
+ my $data_in = '';
+ eval {
+ local $SIG{ALRM} = sub { die "Timeout waiting for EOF from fork" };
+ alarm 5;
+ do {
+ my $read = $g{'forks'}{$fork}{'CS'}->getline();
+ if(defined $read and $read ne '') {$data_in .= $read}
+ else {select undef, undef, undef, 0.001}
+ } until $data_in =~ s/\nEOF\n$//s;
+ alarm 0;
+ };
+ if($@) {
+ do_log("Fork $fork, pid $g{'forks'}{$fork}{'pid'} stalled on reply to ping: $@. Killing this fork.");
+ kill 15, $g{'forks'}{$fork}{'pid'} or do_log("Sending $fork TERM signal failed: $!");
+ close $g{'forks'}{$fork}{'CS'} or do_log("Closing socket to fork $fork failed: $!");
+ delete $g{'forks'}{$fork};
+ next;
+ }
+ do_log("DEBUG SNMP: Fork $fork returned complete message for ping request",4) if $g{'debug'};
+
+ my $hashref = thaw($data_in);
+ my %returned;
+ if (defined $hashref) {
+ do_log("DEBUG SNMP: Dethawing data for ping of fork $fork",4) if $g{'debug'};
+ %returned = %{ thaw($data_in) };
+ }
+ else {
+ print "failed thaw for ping of fork $fork\n";
+ next;
+ }
+ if (defined $returned{'pong'}) {
+ $g{'forks'}{$fork}{'time'} = time;
+ do_log("Fork $fork responded to ping request $returned{'ping'} with $returned{'pong'} at $g{'forks'}{$fork}{'time'}",4) if $g{'debug'};
+ delete $g{'forks'}{$fork}{'pinging'};
+ } else {
+ do_log("Fork $fork didnt send an appropriate response, killing it",4) if $g{'debug'};
+ kill 15, $g{'forks'}{$fork}{'pid'} or do_log("Sending $fork TERM signal failed: $!");
+ close $g{'forks'}{$fork}{'CS'} or do_log("Closing socket to fork $fork failed: $!");
+ delete $g{'forks'}{$fork};
+ next;
+ }
+
+ } else {
+ do_log("DEBUG SNMP: Fork $fork seems not to have replied to our ping, killing it",4);
+ kill 15, $g{'forks'}{$fork}{'pid'} or do_log("Sending $fork TERM signal failed: $!");
+ close $g{'forks'}{$fork}{'CS'} or do_log("Closing socket to fork $fork failed: $!");
+ delete $g{'forks'}{$fork};
+ next;
+ }
+
+ } else {
+ my %ping_input = ('ping' => time);
+ do_log("Fork $fork has been idle for more than cycle time, pinging it at $ping_input{'ping'}",4) if $g{'debug'};
+ my $serialized = nfreeze(\%ping_input);
+ eval {
+ local $SIG{ALRM} = sub { die "Timeout sending polling task data to fork\n" };
+ alarm 15;
+ $g{'forks'}{$fork}{'CS'}->print("$serialized\nEOF\n");
+ alarm 0;
+ };
+ if($@) {
+ do_log("Fork $g{'forks'}{$fork}, pid $g{'forks'}{$fork}{'pid'} not responding: $@. Killing this fork.");
+ kill 15, $g{'forks'}{$fork}{'pid'} or do_log("Sending TERM signal to fork $fork failed: $!");
+ close $g{'forks'}{$fork}{'CS'} or do_log("Closing socket to fork $fork failed: $!");
+ delete $g{'forks'}{$fork};
+ next;
+ }
+ $g{'forks'}{$fork}{'pinging'} = 1;
+ }
+ }
}
}
}
@@ -410,16 +494,40 @@
# Our getline timed out, which means we haven't gotten any data
# in a while. Make sure our parent is still there
if($@) {
- exit 1 if !kill 0, $g{'mypid'};
+ do_log("Fork $fork_num timed out waiting for data from parent: $@",4);
+ if (!kill 0, $g{'mypid'}) {
+ do_log("Parent is no longer running, fork $fork_num exiting");
+ exit 1;
+ }
}
$serialized .= $string_in if defined $string_in;
} until $serialized =~ s/\nEOF\n$//s;
+ do_log("DEBUG SNMP($fork_num): Got EOF in message, attempting to thaw",4) if $g{'debug'};
# Now decode our serialized data scalar
- my %data_in = %{thaw($serialized)};
+ my %data_in;
+ eval {
+ %data_in = %{thaw($serialized)};
+ };
+ if ($@) {
+ do_log("DEBUG SNMP($fork_num): thaw failed attempting to thaw $serialized: $@",4) if $g{'debug'};
+ do_log("DEBUG SNMP($fork_num): Replying to corrupt message with a pong",4) if $g{'debug'};
+ $data_out{'ping'} = '0';
+ $data_out{'pong'} = time;
+ send_data($sock,\%data_out);
+ next DEVICE;
+ }
+ if (defined $data_in{'ping'}) {
+ do_log("DEBUG SNMP($fork_num): Received ping from master $data_in{'ping'},replying",4) if $g{'debug'};
+ $data_out{'ping'} = $data_in{'ping'};
+ $data_out{'pong'} = time;
+ send_data($sock,\%data_out);
+ next DEVICE;
+ }
+
# Get SNMP variables
my $snmp_cid = $data_in{'cid'};
my $snmp_port = $data_in{'port'} || 161; # Default to 161 if not specified
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|