Patch adds a poll timer scheduler to be called 3 times per token timeout. If the poll timer was not called for more than 0.8 * token timeout, it means the corosync process was not scheduled and either token_timeout should be increased or load should be reduced (useful for VMs, where the host is overcommitted so the VM is not scheduled as expected). Signed-off-by: Jan Friesse <jfriesse@xxxxxxxxxx> --- exec/main.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 50 insertions(+), 0 deletions(-) diff --git a/exec/main.c b/exec/main.c index d23e244..42ffb7b 100644 --- a/exec/main.c +++ b/exec/main.c @@ -1295,6 +1295,49 @@ static struct coroipcs_init_state_v2 ipc_init_state_v2 = { .stats_decrement_value = corosync_stats_decrement_value, }; +struct scheduler_pause_timeout_data { + struct totem_config *totem_config; + poll_timer_handle handle; + unsigned long long tv_prev; + unsigned long long max_tv_diff; +}; + +static void timer_function_scheduler_timeout (void *data) +{ + struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data; + unsigned long long tv_current; + unsigned long long tv_diff; + + tv_current = timerlist_nano_current_get (); + + if (timeout_data->tv_prev == 0) { + /* + * Initial call -> just pretent everything is ok + */ + timeout_data->tv_prev = tv_current; + timeout_data->max_tv_diff = 0; + } + + tv_diff = tv_current - timeout_data->tv_prev; + timeout_data->tv_prev = tv_current; + + if (tv_diff > timeout_data->max_tv_diff) { + log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms " + "(threshold is %0.4f ms). 
Consider token timeout increase.", + (float)tv_diff / TIMERLIST_NS_IN_MSEC, (float)timeout_data->max_tv_diff / TIMERLIST_NS_IN_MSEC); + } + + /* + * Set next threshold, because token_timeout can change + */ + timeout_data->max_tv_diff = timeout_data->totem_config->token_timeout * TIMERLIST_NS_IN_MSEC * 0.8; + poll_timer_add (corosync_poll_handle, + timeout_data->totem_config->token_timeout / 3, + timeout_data, + timer_function_scheduler_timeout, + &timeout_data->handle); +} + static void corosync_setscheduler (void) { #if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX) && defined(HAVE_SCHED_SETSCHEDULER) @@ -1556,6 +1599,7 @@ error_close: return (err); } + int main (int argc, char **argv, char **envp) { const char *error_string; @@ -1576,6 +1620,7 @@ int main (int argc, char **argv, char **envp) char corosync_lib_dir[PATH_MAX]; hdb_handle_t object_runtime_handle; enum e_ais_done flock_err; + struct scheduler_pause_timeout_data scheduler_pause_timeout_data; /* default configuration */ @@ -1788,9 +1833,14 @@ int main (int argc, char **argv, char **envp) serialize_unlock, sched_priority); + corosync_poll_handle = poll_create (); poll_low_fds_event_set(corosync_poll_handle, main_low_fds_event); + memset(&scheduler_pause_timeout_data, 0, sizeof(scheduler_pause_timeout_data)); + scheduler_pause_timeout_data.totem_config = &totem_config; + timer_function_scheduler_timeout (&scheduler_pause_timeout_data); + /* * Create exit pipe */ -- 1.7.1 _______________________________________________ discuss mailing list discuss@xxxxxxxxxxxx http://lists.corosync.org/mailman/listinfo/discuss