And, ps -eaf |grep rg root 26354 26349 0 11:08 ? 00:00:00 /bin/sh /etc/rc6.d/K01rgmanager stop > strace -p 26354 Process 26354 attached - interrupt to quit wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 30924 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 --- SIGCHLD (Child exited) @ 0 (0) --- wait4(-1, 0x7fbfffd654, WNOHANG, NULL) = -1 ECHILD (No child processes) rt_sigreturn(0xffffffffffffffff) = 0 rt_sigaction(SIGINT, {SIG_DFL}, {0x432ba0, [], SA_RESTORER, 0x3e1012e380}, 8) = 0 pipe([3, 4]) = 0 rt_sigprocmask(SIG_BLOCK, [INT CHLD], [], 8) = 0 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2a95574470) = 30926 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 rt_sigaction(SIGCHLD, {0x433e50, [], SA_RESTORER, 0x3e1012e380}, {0x433e50, [], SA_RESTORER, 0x3e1012e380}, 8) = 0 close(4) = 0 read(3, "25824\n", 128) = 6 read(3, "", 128) = 0 --- SIGCHLD (Child exited) @ 0 (0) --- wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 30926 wait4(-1, 0x7fbfffd404, WNOHANG, NULL) = -1 ECHILD (No child processes) rt_sigreturn(0xffffffffffffffff) = 0 close(3) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigaction(SIGINT, {0x432ba0, [], SA_RESTORER, 0x3e1012e380}, {SIG_DFL}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 rt_sigaction(SIGINT, {SIG_DFL}, {0x432ba0, [], SA_RESTORER, 0x3e1012e380}, 8) = 0 rt_sigprocmask(SIG_BLOCK, NULL, [], 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 stat("/bin/sleep", {st_mode=S_IFREG|0755, st_size=22040, ...}) = 0 access("/bin/sleep", X_OK) = 0 rt_sigprocmask(SIG_BLOCK, [INT CHLD], [], 8) = 0 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2a95574470) = 30927 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigaction(SIGINT, {0x432ba0, [], SA_RESTORER, 0x3e1012e380}, {SIG_DFL}, 8) = 0 wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 30927 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 --- SIGCHLD (Child exited) @ 0 (0) --- ... On Mon, 24 Jul 2006, Jie Gao wrote: > Date: Mon, 24 Jul 2006 11:29:13 +1000 (EST) > From: Jie Gao <J.Gao@xxxxxxxxxxxxxxx> > Reply-To: linux clustering <linux-cluster@xxxxxxxxxx> > To: linux clustering <linux-cluster@xxxxxxxxxx> > Subject: clurgmgrd refuses to die > > Hi All > > I have been having a persistent problem with shutting down a cluster node. > > I have a two-node cluster. If Node A starts first, there is no problem > rebooting Node B at any time. But if I try to reboot Node A, it hangs > while trying to kill "clurgmgrd": > > > ps -eaf |grep clurgmgrd > 116:root 25824 1 0 10:45 ? 00:00:00 clurgmgrd > > > strace -f kill -TERM 25824 > execve("/bin/kill", ["kill", "-TERM", "25824"], [/* 28 vars */]) = 0 > uname({sys="Linux", node="mix", ...}) = 0 > brk(0) = 0x503000 > mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95556000 > access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) > open("/etc/ld.so.cache", O_RDONLY) = 3 > fstat(3, {st_mode=S_IFREG|0644, st_size=114663, ...}) = 0 > mmap(NULL, 114663, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000 > close(3) = 0 > open("/lib64/tls/libc.so.6", O_RDONLY) = 3 > read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\305\21"..., 832) = 832 > fstat(3, {st_mode=S_IFREG|0755, st_size=1493186, ...}) = 0 > mmap(0x3e10100000, 2310056, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3e10100000 > mprotect(0x3e1022b000, 1085352, PROT_NONE) = 0 > mmap(0x3e1032a000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12a000) = > 0x3e1032a000 > mmap(0x3e10330000, 16296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3e10330000 > close(3) = 0 > mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95573000 > mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95574000 > mprotect(0x3e1032a000, 12288, PROT_READ) = 0 > arch_prctl(ARCH_SET_FS, 0x2a95573b00) = 0 > munmap(0x2a95557000, 114663) = 0 > open("/usr/lib/locale/locale-archive", O_RDONLY) = 3 > fstat(3, {st_mode=S_IFREG|0644, st_size=48516832, ...}) = 0 > mmap(NULL, 48516832, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95575000 > close(3) = 0 > brk(0) = 0x503000 > brk(0x524000) = 0x524000 > kill(25824, SIGTERM) = 0 > exit_group(0) = ? > Process 28578 detached > > > ps -eaf |grep clurgmgrd > 116:root 25824 1 0 10:45 ? 00:00:00 clurgmgrd > > > strace -p 25824 > Process 25824 attached - interrupt to quit > select(7, [4 5 6], NULL, NULL, {7, 735000}) = 0 (Timeout) > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\1\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\3\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45 > read(9, "\3\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\3\0\0\0", 20) = 20 > read(9, "30\0", 3) = 3 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\2\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > select(7, [6], [6], NULL, {0, 0}) = 0 (Timeout) > select(6, [5], [5], NULL, {0, 0}) = 0 (Timeout) > select(7, [4 5 6], NULL, NULL, {10, 0}) = 0 (Timeout) > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\1\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\3\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45 > read(9, "\3\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\3\0\0\0", 20) = 20 > read(9, "30\0", 3) = 3 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\2\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > select(7, [6], [6], NULL, {0, 0}) = 0 (Timeout) > select(6, [5], [5], NULL, {0, 0}) = 0 (Timeout) > select(7, [4 5 6], NULL, NULL, {10, 0}) = 0 (Timeout) > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\1\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\3\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45 > read(9, "\3\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\3\0\0\0", 20) = 20 > read(9, "30\0", 3) = 3 > close(9) = 0 > socket(PF_FILE, SOCK_STREAM, 0) = 9 > connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0 > write(9, "\2\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\0\0\0\0", 20) = 20 > read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20 > close(9) = 0 > ... > > > What is clurgmgrd exactly trying to do? > > Regards, > > > Jie > > -- > > Linux-cluster@xxxxxxxxxx > https://www.redhat.com/mailman/listinfo/linux-cluster > -- Linux-cluster@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/linux-cluster