Hello all,
I was initially using a fresh Debian (latest stable, etch) with the
Debian packages that are available here (deb
http://lmello.virt-br.org/debian ./ ...).
While using them I noticed a crash on one of my servers, and saw that
the version I was running was glusterfs 1.3.8pre1. After an upgrade to
glusterfs 1.3.8pre3 I got another core dump, so I'm out of ideas for
the moment :)
My current (test) set-up: two servers and a client, all running the
latest Debian stable with glusterfs 1.3.8pre1.
The kernel:
Linux tweety 2.6.18-6-amd64 #1 SMP Sun Feb 10 17:50:19 UTC 2008 x86_64
GNU/Linux
The filesystems are xfs.
My config file on one of the servers:

volume sas-ds
  type storage/posix
  option directory /sas/data
end-volume

volume sas-ns
  type storage/posix
  option directory /sas/ns
end-volume

volume sata-ds
  type storage/posix
  option directory /sata/data
end-volume

volume sata-ns
  type storage/posix
  option directory /sata/ns
end-volume

volume sas-backup-ds
  type protocol/client
  option transport-type tcp/client
  option remote-host 10.6.0.8
  option remote-subvolume sas-ds
end-volume

volume sas-backup-ns
  type protocol/client
  option transport-type tcp/client
  option remote-host 10.6.0.8
  option remote-subvolume sas-ns
end-volume

volume sata-backup-ds
  type protocol/client
  option transport-type tcp/client
  option remote-host 10.6.0.8
  option remote-subvolume sata-ds
end-volume

volume sata-backup-ns
  type protocol/client
  option transport-type tcp/client
  option remote-host 10.6.0.8
  option remote-subvolume sata-ns
end-volume

volume sas-ds-afr
  type cluster/afr
  subvolumes sas-ds sas-backup-ds
end-volume

volume sas-ns-afr
  type cluster/afr
  subvolumes sas-ns sas-backup-ns
end-volume

volume sata-ds-afr
  type cluster/afr
  subvolumes sata-ds sata-backup-ds
end-volume

volume sata-ns-afr
  type cluster/afr
  subvolumes sata-ns sata-backup-ns
end-volume

volume sas-unify
  type cluster/unify
  subvolumes sas-ds-afr
  option namespace sas-ns-afr
  option scheduler rr
end-volume

volume sata-unify
  type cluster/unify
  subvolumes sata-ds-afr
  option namespace sata-ns-afr
  option scheduler rr
end-volume

volume sas
  type performance/io-threads
  option thread-count 8
  option cache-size 64MB
  subvolumes sas-unify
end-volume

volume sata
  type performance/io-threads
  option thread-count 8
  option cache-size 64MB
  subvolumes sata-unify
end-volume

volume server
  type protocol/server
  option transport-type tcp/server
  subvolumes sas sata
  option auth.ip.sas-ds.allow 10.6.0.8,127.0.0.1
  option auth.ip.sas-ns.allow 10.6.0.8,127.0.0.1
  option auth.ip.sata-ds.allow 10.6.0.8,127.0.0.1
  option auth.ip.sata-ns.allow 10.6.0.8,127.0.0.1
  option auth.ip.sas.allow *
  option auth.ip.sata.allow *
end-volume
The client is running with the following config file:
volume sas
  type protocol/client
  option transport-type tcp/client    # for TCP/IP transport
  option remote-host 10.6.0.10
  option remote-subvolume sas         # name of the remote volume
end-volume

volume sata
  type protocol/client
  option transport-type tcp/client    # for TCP/IP transport
  option remote-host 10.6.0.10
  option remote-subvolume sata        # name of the remote volume
end-volume
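For completeness, this is roughly how I start things; the server
command line matches the one in the crash dump below, while the spec
file paths and the mount point here are just placeholders:

```shell
# server side (spec file path is a placeholder; -l/-L match the crash dump)
glusterfsd -f /etc/glusterfs/glusterfs-server.vol \
  -l /var/log/glusterfs/glusterfsd.log -L WARNING

# client side: FUSE-mount the volumes (spec path and mount point are placeholders)
glusterfs -f /etc/glusterfs/glusterfs-client.vol /mnt/glusterfs
```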
After a while, when the client is copying data to my server 10.6.0.10
(and that server is nicely replicating via AFR to 10.6.0.8), I get the
following message (the dump appears twice in the log; one copy below):
TLA Repo Revision: glusterfs--mainline--2.5--patch-704
Time : 2008-03-24 18:36:09
Signal Number : 11
glusterfsd -l /var/log/glusterfs/glusterfsd.log -L WARNING
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
frame : type(0) op(0)
/lib/libc.so.6[0x2af9d5c48110]
/lib/libc.so.6(strlen+0x30)[0x2af9d5c8b5a0]
/usr/lib/libglusterfs.so.0[0x2af9d58f478e]
/usr/lib/libglusterfs.so.0(mop_lock_impl+0x4e)[0x2af9d58f4c1e]
/usr/lib/glusterfs/1.3.8pre3/xlator/cluster/afr.so(afr_close+0x45f)[0x2aaaaacd1a2f]
/usr/lib/glusterfs/1.3.8pre3/xlator/cluster/unify.so(unify_close+0x113)[0x2aaaaade5663]
/usr/lib/glusterfs/1.3.8pre3/xlator/performance/io-threads.so[0x2aaaaaef0704]
/usr/lib/libglusterfs.so.0(call_resume+0x6a)[0x2af9d58f63ca]
/usr/lib/glusterfs/1.3.8pre3/xlator/performance/io-threads.so[0x2aaaaaeefc6b]
/lib/libpthread.so.0[0x2af9d5b09f1a]
/lib/libc.so.6(__clone+0x72)[0x2af9d5ce25d2]
---------
Running gdb on the core dump:

Core was generated by `[glusterfs]'.
Program terminated with signal 11, Segmentation fault.
#0  0x00002af9d5c8b5a0 in strlen () from /lib/libc.so.6
(gdb) backtrace
#0 0x00002af9d5c8b5a0 in strlen () from /lib/libc.so.6
#1 0x00002af9d58f478e in place_lock_after (granted=0x2af9d59ff9c0,
path=0x2aaaabdbf1e0
"/sata-ds//img_large/20030106/fred33go/1041867234.jpg/") at lock.c:84
#2 0x00002af9d58f4c1e in mop_lock_impl (frame=0x2aaaabdbf180,
this_xl=<value optimized out>, path=0x2aaaabdbf330
"/sata-ds//img_large/20030106/fred33go/1041867234.jpg") at lock.c:118
#3 0x00002aaaaacd1a2f in afr_close (frame=0x2aaaabdb1bb0, this=<value
optimized out>, fd=0x2aaaabb916c0) at afr.c:3676
#4 0x00002aaaaade5663 in unify_close (frame=0x2aaaabdb1d20,
this=<value optimized out>, fd=0x2aaaabb916c0) at unify.c:2384
#5 0x00002aaaaaef0704 in iot_close_wrapper (frame=0x2aaaabb71260,
this=0x50dfb0, fd=0x2aaaabb916c0) at io-threads.c:190
#6 0x00002af9d58f63ca in call_resume (stub=0x0) at call-stub.c:2740
#7 0x00002aaaaaeefc6b in iot_worker (arg=<value optimized out>) at
io-threads.c:1061
#8 0x00002af9d5b09f1a in start_thread () from /lib/libpthread.so.0
#9 0x00002af9d5ce25d2 in clone () from /lib/libc.so.6
#10 0x0000000000000000 in ?? ()
If anyone has an idea, it would be very welcome.
Regards,
Tom