On Fri, 2006-08-04 at 12:44 +0200, Alain Moulle wrote:
> Hi Ron,
>
> could you provide me the defects numbers and/or linked patches ?

Here's the current list of pending fixes:

http://bugzilla.redhat.com/bugzilla/buglist.cgi?component=rgmanager&bug_status=MODIFIED&bug_status=FAILS_QA&bug_status=ON_QA

The patch for internal self-monitoring was simply a backport from the HEAD branch. I've attached a hand-edited patch which should enable the self-monitoring bit.

Additionally, there was a segfault fixed in U3. Here's the errata advisory, which contains links to bugzillas:

https://rhn.redhat.com/errata/RHBA-2006-0241.html

-- Lon
Index: Makefile =================================================================== RCS file: /cvs/cluster/cluster/rgmanager/src/daemons/Makefile,v retrieving revision 1.6.2.3 retrieving revision 1.6.2.5 diff -u -r1.6.2.3 -r1.6.2.5 --- Makefile 17 Oct 2005 20:23:52 -0000 1.6.2.3 +++ Makefile 26 May 2006 17:39:32 -0000 1.6.2.5 @@ -40,7 +40,8 @@ clurgmgrd: rg_thread.o rg_locks.o main.o groups.o rg_state.o \ rg_queue.o members.o rg_forward.o reslist.o \ - resrules.o restree.o fo_domain.o + resrules.o restree.o fo_domain.o nodeevent.o \ + watchdog.o $(CC) -o $@ $^ $(INCLUDE) $(CFLAGS) $(LDFLAGS) -lccs # Index: main.c =================================================================== RCS file: /cvs/cluster/cluster/rgmanager/src/daemons/main.c,v retrieving revision 1.9.2.12.2.1 retrieving revision 1.9.2.17 diff -u -r1.9.2.12.2.1 -r1.9.2.17 --- main.c 25 Jan 2006 18:52:33 -0000 1.9.2.12.2.1 +++ main.c 26 May 2006 17:39:32 -0000 1.9.2.17 @@ -44,9 +48,10 @@ void graceful_exit(int); void flag_shutdown(int sig); void hard_exit(void); -int send_rg_states(int); +int send_rg_states(int ); int check_config_update(void); int svc_exists(char *); +int watchdog_init(void); int shutdown_pending = 0, running = 1, need_reconfigure = 0; char debug = 0; /* XXX* */ @@ -646,8 +735,11 @@ if (foreground) clu_log_console(1); - if (!foreground && (geteuid() == 0)) + if (!foreground && (geteuid() == 0)) { daemon_init(argv[0]); + if (!debug && !watchdog_init()) + clulog(LOG_NOTICE, "Failed to start watchdog\n"); + } /* We need quorum before we can read the configuration data from Index: watchdog.c =================================================================== RCS file: watchdog.c diff -N watchdog.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ watchdog.c 26 May 2006 17:39:32 -0000 1.1.2.1 @@ -0,0 +1,97 @@ +/* + Copyright Red Hat, Inc. 
2005-2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING. If not, write to the
+  Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+  MA 02139, USA.
+*/
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/reboot.h>
+#include <stdlib.h>
+
+#include <signals.h>	/* project header; presumably declares setup_signal() -- confirm */
+#include <clulog.h>	/* project header; presumably declares clulog() -- confirm */
+
+static pid_t child = 0;	/* set from fork() in watchdog_init(); the process signal_handler() signals */
+
+static void
+signal_handler(int signum)	/* relay the signal just received to the process recorded in 'child' */
+{
+	kill(child, signum);	/* return value is not checked */
+}
+static void
+redirect_signals(void)	/* install a disposition for every signal number from 0 up to _NSIG - 1 */
+{
+	int i;
+	for (i = 0; i < _NSIG; i++) {	/* NOTE(review): range starts at 0 and covers every number below _NSIG; confirm setup_signal() tolerates numbers that cannot be installed */
+		switch (i) {
+		case SIGCHLD:
+		case SIGILL:
+		case SIGFPE:
+		case SIGSEGV:
+		case SIGBUS:
+			setup_signal(i, SIG_DFL);	/* these five are handed SIG_DFL instead of the forwarding handler */
+			break;
+		default:
+			setup_signal(i, signal_handler);	/* every other number is routed to signal_handler() */
+		}
+	}
+}
+
+/**
+  return watchdog's pid, or 0 on failure. Calls fork(): the child gets back the pid the caller had before the fork (the watchdog's pid); 0 is returned when fork() fails. The parent never returns from this function: it installs the forwarding handlers, waits on the child, and leaves only through exit(), raise(SIGKILL) or reboot().
+*/
+int
+watchdog_init(void)	/* NOTE(review): a pid_t value is returned through an int */
+{
+	int status;
+	pid_t parent;
+
+	parent = getpid();	/* captured before fork() so the child can report it */
+	child = fork();
+	if (child < 0)
+		return 0;	/* fork() failed; no watchdog is running */
+	else if (!child)
+		return parent;	/* child process: hand back the watchdog's pid and carry on */
+
+	redirect_signals();	/* parent only: from here on it acts as the watchdog */
+
+	while (1) {
+		if (waitpid(child, &status, 0) <= 0)	/* NOTE(review): every failure is retried without looking at errno; confirm a persistent error cannot spin here */
+			continue;
+
+		if (WIFEXITED(status))
+			exit(WEXITSTATUS(status));	/* child exited normally: exit with the same status */
+
+		if (WIFSIGNALED(status)) {
+			if (WTERMSIG(status) == SIGKILL) {	/* child was terminated by SIGKILL: no reboot on this path */
+				clulog(LOG_CRIT, "Watchdog: Daemon killed, exiting\n");
+				raise(SIGKILL);	/* send the same signal to this process */
+				while(1) ;	/* spins if raise() returns; presumably not expected to be reached */
+			}
+			else {	/* child was terminated by any other signal */
+#ifdef DEBUG
+				clulog(LOG_CRIT, "Watchdog: Daemon died, but not rebooting because DEBUG is set\n");
+#else
+				clulog(LOG_CRIT, "Watchdog: Daemon died, rebooting...\n");
+				sync();	/* called immediately before reboot() */
+				reboot(RB_AUTOBOOT);	/* return value is not checked */
+#endif
+				exit(255);	/* reached in DEBUG builds, or whenever reboot() returns */
+			}
+		}
+	}
+}
-- Linux-cluster@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/linux-cluster