From: <ag...@us...> - 2011-12-12 14:27:27
|
Revision: 1850 http://nagios.svn.sourceforge.net/nagios/?rev=1850&view=rev Author: ageric Date: 2011-12-12 14:27:16 +0000 (Mon, 12 Dec 2011) Log Message: ----------- core: Avoid insane looping when rescheduling checks Previously we used to iterate over the entire event queue to see if the object we were about to reschedule a check for was already scheduled or not. Since checks are normally rescheduled just after they've been run, that means we never hit an event inside the list and so we walk all of it without ever finding it, causing O(n) worst-case complexity and a probability of hitting that worst-case complexity of very nearly 100%. With this patch, we reuse the currently unused nexthash field of hosts and service objects to stash the next scheduled check event, providing constant-time access to it and avoiding the insane looping, which should reduce Nagios' cpu consumption substantially, since almost 80% of Nagios' user cpu time is spent on this loop. Relevant gprof output before patch: % time cumulat self calls ms/call ms/call name 17.24 0.05 0.05 3490 0.01 0.01 schedule_service_check 17.24 0.10 0.05 1 50.00 261.97 event_execution_loop Relevant gprof output after patch: % time cumulat self calls ms/call ms/call name 12.20 0.05 0.05 1 50.00 352.26 event_execution_loop 9.76 0.09 0.04 10565 0.00 0.00 clear_host_macros_r .... 0.00 0.41 0.00 3594 0.00 0.01 schedule_service_check >From 17.24% of the time used to less than 0.01 is quite good, so we stick with that and let it be enough. Testing is ofcourse required to make sure checks aren't being run extremely frequently, but that shouldn't be the case since the new check is assigned to the scheduled object in case we replace it with a forced one. Reported-by: Mathias Kettner <mk...@ma...> Signed-off-by: Andreas Ericsson <ae...@op...> Modified Paths: -------------- nagioscore/trunk/base/checks.c nagioscore/trunk/include/objects.h Modified: nagioscore/trunk/base/checks.c =================================================================== --- nagioscore/trunk/base/checks.c 2011-12-12 14:26:56 UTC (rev 1849) +++ nagioscore/trunk/base/checks.c 2011-12-12 14:27:16 UTC (rev 1850) @@ -249,6 +249,12 @@ log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_service_check() start\n"); log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n", svc->description, svc->host_name, check_options, latency); + /* + * reset the next_check_event so we know it's + * no longer in the scheduling queue + */ + svc->next_check_event = NULL; + /* attempt to run the check */ result = run_async_service_check(svc, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time); @@ -1656,7 +1662,6 @@ void schedule_service_check(service *svc, time_t check_time, int options) { timed_event *temp_event = NULL; timed_event *new_event = NULL; - int found = FALSE; int use_original_event = TRUE; log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_service_check()\n"); @@ -1683,25 +1688,15 @@ /* default is to use the new event */ use_original_event = FALSE; - found = FALSE; -#ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED - /* WARNING! 1/19/07 on-demand async service checks will end up causing mutliple scheduled checks of a service to appear in the queue if the code below is skipped */ - /* if(use_large_installation_tweaks==FALSE)... skip code below */ -#endif + temp_event = (timed_event *)svc->next_check_event; - /* see if there are any other scheduled checks of this service in the queue */ - for(temp_event = event_list_low; temp_event != NULL; temp_event = temp_event->next) { + /* + * If the service already has a check scheduled, + * we need to decide which of the events to use + */ + if(temp_event != NULL) { - if(temp_event->event_type == EVENT_SERVICE_CHECK && svc == (service *)temp_event->event_data) { - found = TRUE; - break; - } - } - - /* we found another service check event for this service in the queue - what should we do? */ - if(found == TRUE && temp_event != NULL) { - log_debug_info(DEBUGL_CHECKS, 2, "Found another service check event for this service @ %s", ctime(&temp_event->run_time)); /* use the originally scheduled check unless we decide otherwise */ @@ -1747,6 +1742,7 @@ else { remove_event(temp_event, &event_list_low, &event_list_low_tail); my_free(temp_event); + svc->next_check_event = new_event; } } @@ -2163,7 +2159,6 @@ void schedule_host_check(host *hst, time_t check_time, int options) { timed_event *temp_event = NULL; timed_event *new_event = NULL; - int found = FALSE; int use_original_event = TRUE; @@ -2190,24 +2185,15 @@ /* default is to use the new event */ use_original_event = FALSE; - found = FALSE; -#ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED - /* WARNING! 1/19/07 on-demand async host checks will end up causing mutliple scheduled checks of a host to appear in the queue if the code below is skipped */ - /* if(use_large_installation_tweaks==FALSE)... skip code below */ -#endif + temp_event = (timed_event *)hst->next_check_event; - /* see if there are any other scheduled checks of this host in the queue */ - for(temp_event = event_list_low; temp_event != NULL; temp_event = temp_event->next) { - if(temp_event->event_type == EVENT_HOST_CHECK && hst == (host *)temp_event->event_data) { - found = TRUE; - break; - } - } + /* + * If the host already had a check scheduled we need + * to decide which check event to use + */ + if(temp_event != NULL) { - /* we found another host check event for this host in the queue - what should we do? */ - if(found == TRUE && temp_event != NULL) { - log_debug_info(DEBUGL_CHECKS, 2, "Found another host check event for this host @ %s", ctime(&temp_event->run_time)); /* use the originally scheduled check unless we decide otherwise */ @@ -2253,6 +2239,7 @@ else { remove_event(temp_event, &event_list_low, &event_list_low_tail); my_free(temp_event); + hst->next_check_event = new_event; } } @@ -2881,6 +2868,12 @@ log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n", hst->name, check_options, latency); + /* + * reset the next_check_event so we know this host + * check is no longer in the scheduling queue + */ + hst->next_check_event = NULL; + /* attempt to run the check */ result = run_async_host_check_3x(hst, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time); Modified: nagioscore/trunk/include/objects.h =================================================================== --- nagioscore/trunk/include/objects.h 2011-12-12 14:26:56 UTC (rev 1849) +++ nagioscore/trunk/include/objects.h 2011-12-12 14:27:16 UTC (rev 1850) @@ -383,7 +383,7 @@ objectlist *hostgroups_ptr; #endif struct host_struct *next; - struct host_struct *nexthash; + void *next_check_event; }; @@ -514,7 +514,7 @@ objectlist *servicegroups_ptr; #endif struct service_struct *next; - struct service_struct *nexthash; + void *next_check_event; }; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |