I have three large rsyncs running in parallel, checking about 5 TB of files. The run
crashes towards the end; while it is running, glusterfs is using about 600 MB of memory.
P.S. If you want three remote machines to work on, let me know.
gdb bt:
#0  0xb76490cd in unify_sh_readdir_cbk (frame=0xd5e2de8, cookie=0x8057e08, this=0x80580b8, op_ret=-1, op_errno=107, entry=0x0, count=0) at unify-self-heal.c:218
#1  0xb7652ec4 in client_readdir_cbk (frame=0xd50d0f0, args=0xe16e860) at client-protocol.c:2816
#2  0xb7655e65 in client_protocol_cleanup (trans=0x8090200) at client-protocol.c:4089
#3  0xb7656bd7 in notify (this=0x8057e08, event=3, data=0x8090200) at client-protocol.c:4515
#4  0xb7fca4a7 in transport_notify (this=0x8090200, event=17) at transport.c:152
#5  0xb7fcab8c in epoll_notify (eevent=17, data=0x8090200) at epoll.c:54
#6  0xb7fcae74 in sys_epoll_iteration (ctx=0xbffedd0c) at epoll.c:146
#7  0xb7fca69a in poll_iteration (ctx=0xbffedd0c) at transport.c:260
#8  0x0804a6de in main (argc=3, argv=0xbffeddd4) at glusterfs.c:341
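
From the backtrace it looks like the transport bailed mid-run, client_protocol_cleanup()
answered the pending readdir with op_ret=-1, op_errno=107 (ENOTCONN), entry=NULL and
count=0, and the unify self-heal readdir callback then walked the entry list anyway.
Below is only my guess at the shape of the code (a minimal C sketch, not the actual
unify-self-heal.c:218 source); the struct layout and every name not shown in the trace
are made up for illustration.

/*
 * Hypothetical sketch of the crash pattern suggested by frame #0;
 * this is NOT the actual GlusterFS 1.3.0-pre5 unify-self-heal.c code.
 * The list layout (a dummy head entry with real entries on ->next)
 * and all names except those in the trace are assumptions.
 */
#include <stddef.h>

struct dir_entry {
        char             *name;
        struct dir_entry *next;
};

static int
readdir_cbk_sketch (int op_ret, int op_errno,
                    struct dir_entry *entry, int count)
{
        struct dir_entry *trav = NULL;

        (void) op_errno;
        (void) count;

        /* Frame #0 shows the callback entered with op_ret == -1 and
           entry == 0x0; without this guard the entry->next access in
           the loop below is exactly that SIGSEGV. */
        if (op_ret == -1 || entry == NULL)
                return -1;      /* transport bailed, nothing to merge */

        for (trav = entry->next; trav != NULL; trav = trav->next) {
                /* ... merge trav->name into the self-heal listing ... */
        }

        return 0;
}

If the real callback is missing a guard like that, a brick disconnecting in the middle
of a big parallel rsync would take the whole client process down instead of just failing
that one readdir, which would match what I'm seeing.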
=========================
glusterfs.log:
2007-07-02 19:01:29 C [client-protocol.c:215:call_bail] server1: bailing transport
2007-07-02 19:01:29 E [client-protocol.c:328:client_protocol_xfer] server1: transport_submit failed
2007-07-02 19:01:29 C [client-protocol.c:215:call_bail] server1vol1: bailing transport
2007-07-02 19:01:29 C [tcp.c:81:tcp_disconnect] server1: connection disconnected
2007-07-02 19:01:29 E [client-protocol.c:328:client_protocol_xfer] server1vol1: transport_submit failed
2007-07-02 19:01:29 C [common-utils.c:205:gf_print_trace] debug-backtrace: Got signal (11), printing backtrace
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/libglusterfs.so.0(gf_print_trace+0x2b) [0xb7fc8a8a]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: [0xffffe420]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/glusterfs/1.3.0-pre5/xlator/protocol/client.so [0xb7652ec4]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/glusterfs/1.3.0-pre5/xlator/protocol/client.so [0xb7655e65]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/glusterfs/1.3.0-pre5/xlator/protocol/client.so(notify+0x131) [0xb7656bd7]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/libglusterfs.so.0(transport_notify+0x6a) [0xb7fca4a7]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/libglusterfs.so.0 [0xb7fcab8c]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/libglusterfs.so.0(sys_epoll_iteration+0x14b) [0xb7fcae74]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/libglusterfs.so.0(poll_iteration+0x1d) [0xb7fca69a]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: [glusterfs] [0x804a6de]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: /lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xd2) [0xb7e78ea2]
2007-07-02 19:01:29 C [common-utils.c:207:gf_print_trace] debug-backtrace: [glusterfs] [0x8049cdd]
=========================
server.vol:
volume volume1
  type storage/posix
  option directory /volume1
end-volume

volume clusterfs1
  type performance/io-threads
  option thread-count 10
  subvolumes volume1
end-volume

#######

volume volumenamespace
  type storage/posix
  option directory /volume.namespace
end-volume

###

volume clusterfs
  type protocol/server
  option transport-type tcp/server
  subvolumes clusterfs1 volumenamespace
  option auth.ip.clusterfs1.allow *
  option auth.ip.volumenamespace.allow *
end-volume
=========================
client.vol:
volume server1
  type protocol/client
  option transport-type tcp/client       # for TCP/IP transport
  option remote-host 127.0.0.1           # IP address of the remote brick
  option remote-subvolume volumenamespace
end-volume

volume server1vol1
  type protocol/client
  option transport-type tcp/client       # for TCP/IP transport
  option remote-host 127.0.0.1           # IP address of the remote brick
  option remote-subvolume clusterfs1
end-volume

# volume server1vol1
#   type performance/io-threads
#   option thread-count 10
#   subvolumes server1vol1spec
# end-volume

###################

volume bricks
  type cluster/unify
  option namespace server1
  option readdir-force-success on        # ignore failed mounts
  subvolumes server1vol1
  option scheduler rr
  option rr.limits.min-free-disk 5       # %
end-volume

volume writebehind                       # write-behind improves write performance a lot
  type performance/write-behind
  option aggregate-size 131072           # in bytes
  subvolumes bricks
end-volume

volume readahead
  type performance/read-ahead
  option page-size 65536                 # unit in bytes
  option page-count 16                   # cache per file = (page-count x page-size)
  subvolumes writebehind
end-volume