Latest dmesg entry --
hal[3509]: segfault at 0000000000000000 rip 0000000000400ec7 rsp 0000007fbfffd7e0 error 4
grep clurgmgrd /var/log/messages --
[snip]
Dec 11 06:39:43 bamf01 clurgmgrd: [7983]: <info> Executing /etc/init.d/rsyncd-tiger status
Dec 11 06:39:44 bamf01 clurgmgrd: [7983]: <info> Executing /etc/init.d/httpd.cluster status
Dec 11 06:39:44 bamf01 clurgmgrd: [7983]: <info> Executing /etc/init.d/rsyncd-hartigan status
Dec 11 06:41:11 bamf01 clurgmgrd[7983]: <err> #48: Unable to obtain cluster lock: Connection timed out
Dec 11 06:41:56 bamf01 clurgmgrd[7983]: <err> #50: Unable to obtain cluster lock: Connection timed out
[snip]
On 12/11/06, aberoham@xxxxxxxxx <aberoham@xxxxxxxxx> wrote:
I have a five-node cluster, RHEL4.4 with the latest errata. One node is telling me "Timed out waiting for a response from Resource Group Manager" when I run clustat, and if I strace one of its clurgmgrd PIDs it appears to be stuck in a futex locking call ---
root@bamf01:~
(1)>clustat
Timed out waiting for a response from Resource Group Manager
Member Status: Quorate
Member Name                  Status
------ ----                  ------
bamf01                       Online, Local, rgmanager
bamf02                       Online
bamf03                       Online, rgmanager
bamf04                       Online, rgmanager
bamf05                       Online, rgmanager
Other nodes are fine:
root@bamf03:/etc/init.d
(0)>clustat
Member Status: Quorate
Member Name                  Status
------ ----                  ------
bamf01                       Online, rgmanager
bamf02                       Online
bamf03                       Online, Local, rgmanager
bamf04                       Online, rgmanager
bamf05                       Online, rgmanager

Service Name                 Owner (Last)        State
------- ----                 ----- ------        -----
goat-design                  bamf05              started
cougar-compout               bamf05              started
cheetah-renderout            bamf01              started
postgresql-blur              bamf04              started
tiger-jukebox                bamf01              started
hartigan-home                bamf01              started
cman_tool status on the rgmanager-failed node (bamf01) matches cman_tool status on the other nodes except for the "Active subsystems" count. The difference is that the node with the failed rgmanager is running a service that uses GFS, so it has +4 active subsystems: two DLM lock spaces for a GFS fs and two GFS mount groups --
root@bamf01:~
(0)>cman_tool status
Protocol version: 5.0.1
Config version: 34
Cluster name: bamf
Cluster ID: 1492
Cluster Member: Yes
Membership state: Cluster-Member
Nodes: 5
Expected_votes: 5
Total_votes: 5
Quorum: 3
Active subsystems: 8
Node name: bamf01
Node ID: 2
Node addresses: 10.0.19.21
root@bamf05:~
(0)>cman_tool status
Protocol version: 5.0.1
Config version: 34
Cluster name: bamf
Cluster ID: 1492
Cluster Member: Yes
Membership state: Cluster-Member
Nodes: 5
Expected_votes: 5
Total_votes: 5
Quorum: 3
Active subsystems: 5
Node name: bamf05
Node ID: 4
Node addresses: 10.0.19.25
root@bamf01:~
(0)>cman_tool services
Service          Name                     GID  LID  State   Code
Fence Domain:    "default"                  1    2  run     -
[2 1 4 5 3]
DLM Lock Space:  "clvmd"                    2    3  update  U-4,1,3
[1 2 4 5 3]
DLM Lock Space:  "Magma"                    4    5  run     -
[1 2 4 5]
DLM Lock Space:  "gfs1"                     5    6  run     -
[2]
GFS Mount Group: "gfs1"                     6    7  run     -
[2]
User:            "usrm::manager"            3    4  run     -
[1 2 4 5]
root@bamf04:~
(0)>cman_tool services
Service          Name                     GID  LID  State   Code
Fence Domain:    "default"                  1    2  run     -
[1 2 4 5 3]
DLM Lock Space:  "clvmd"                    2    3  update  U-4,1,3
[1 2 4 5 3]
DLM Lock Space:  "Magma"                    4    5  run     -
[1 2 4 5]
User:            "usrm::manager"            3    4  run     -
[1 2 4 5]
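A quick sketch for comparing the two nodes' cman state side by side (assumes passwordless root ssh between the nodes; bamf05 is just an arbitrary healthy node):

# sketch: dump cman status + services from both nodes and diff them
for n in bamf01 bamf05; do
    ssh root@$n 'cman_tool status; echo; cman_tool services' > /tmp/cman.$n
done
diff -u /tmp/cman.bamf01 /tmp/cman.bamf05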
An strace of the two running clurgmgrd processes on an OK node shows this:
root@bamf05:~
(1)>ps auxw |grep clurg |grep -v grep
root 7988 0.0 0.0 9568 376 ? S<s Dec08 0:00 clurgmgrd
root 7989 0.0 0.0 58864 5012 ? S<l Dec08 0:35 clurgmgrd
root@bamf05:~
(0)>strace -p 7988
Process 7988 attached - interrupt to quit
wait4(7989,
[nothing]
root@bamf05:~
(0)>strace -p 7989
Process 7989 attached - interrupt to quit
select(7, [4 5 6], NULL, NULL, {7, 760000}) = 0 (Timeout)
socket(PF_FILE, SOCK_STREAM, 0) = 12
connect(12, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(12, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(12, "\1\0\0\0\0\0\0\0|o_\0\0\0\0\0\0\0\0\0", 20) = 20
close(12) = 0
[snip]
An strace of the clurgmgrd PIDs on the failed node shows:
root@bamf01:~
(0)>ps auxw |grep clurg |grep -v grep
root 7982 0.0 0.0 9568 376 ? S<s Dec08 0:00 clurgmgrd
root 7983 0.0 0.0 61592 7220 ? S<l Dec08 1:03 clurgmgrd
root@bamf01:~
(0)>strace -p 7982
Process 7982 attached - interrupt to quit
wait4(7983,
[nothing]
root@bamf01:~
(0)>strace -p 7983
Process 7983 attached - interrupt to quit
futex(0x522e28, FUTEX_WAIT, 5, NULL
[nothing]
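A rough gdb sketch for pulling a userspace backtrace of the stuck process, in case that helps identify which internal lock 0x522e28 is (assumes gdb and the rgmanager debuginfo are installed; the /usr/sbin/clurgmgrd path is a guess):

# sketch: attach gdb to the futex-stuck rgmanager child (PID 7983 from above)
# and dump backtraces for all threads, then detach
cat > /tmp/gdb-bt.cmd <<'EOF'
thread apply all bt
detach
quit
EOF
gdb -batch -x /tmp/gdb-bt.cmd /usr/sbin/clurgmgrd 7983 > /tmp/clurgmgrd-7983.bt 2>&1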
Abe
--
Linux-cluster mailing list
Linux-cluster@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/linux-cluster