----- Original Message ----- > From: "Raghavendra Talur" <rtalur@xxxxxxxxxx> > To: "Emmanuel Dreyfus" <manu@xxxxxxxxxx> > Cc: "Gluster Devel" <gluster-devel@xxxxxxxxxxx> > Sent: Thursday, December 31, 2015 3:40:54 PM > Subject: Re: quota.t hangs on NetBSD machines > > > > On Thu, Dec 31, 2015 at 3:24 PM, Emmanuel Dreyfus < manu@xxxxxxxxxx > wrote: > > > On Thu, Dec 31, 2015 at 02:51:41PM +0530, Raghavendra Talur wrote: > > To our surprise though, the hung test started proceeding. > > You mean a process gets stuck in a system call for hours and then > is able to escape? > > Some hints: > > 1) ps -axl shows the waiting channel (WCHAN column) for a process stuck > in kernel. What is it? > > # ps -axl | grep 23268 > 0 23268 1 0 85 0 56436 12544 select Isl ? 0:02.06 glusterfs > --attribute-timeout=0 --entry-timeout=0 -s nbsla > # ps -axl | grep 26515 > 0 26515 1 0 85 0 4508 1440 kqueue S+ pts/0 0:00.04 perfused: perfused > /mnt/glusterfs/0 (perfused) > > > > > 2) crash is a kernel debugger that can be used while running multiuser. > Of course since the system is running, the output is obsolete most > of the time, but for a stuck process we can extract valuable information. > > Run crash from the shell, then inside crash, run the ps command. Find the > relevant process and note the address in the STRUCT LWP * column. For > example, let us say it is c63452a0. > > relevant lines: > > 23268 8 3 0 80 c4c9f000 glusterfsd parked > 23268 7 3 0 80 c5223a80 glusterfsd netio > 23268 6 3 0 80 c542e560 glusterfsd nanoslp > 23268 5 3 1 80 c5229a80 glusterfsd parked > 23268 4 3 0 80 c5418d40 glusterfsd parked > 23268 3 3 0 80 c5346540 glusterfsd sigwait > 23268 2 3 0 80 c4ce22c0 glusterfsd nanoslp > 23268 1 3 1 80 c5418020 glusterfsd select > 26515 1 3 1 80 c53692c0 perfused kqueue > > > > > bt/a c63452a0 will produce a kernel backtrace for the process. This can > be extremely valuable to understand what is going on. 
If we are waiting > for a lock, we can track which process is holding it. > > bt/a c4c9f000 > trace: pid 23268 lid 8 at 0xdc171e9c > sleepq_block(0,1,c047f728,c049a1bc,6,5000a018,8,dc171f54,c4c9f000,6) at > sleepq_block+0x9b > lwp_park(0,1,0,bb1ac150,1de,dc171f54,6,dc171f40,c4c9f000,c0494528) at > lwp_park+0x115 > sys____lwp_park60(c4c9f000,dc171f54,dc171f7c,dc171fa0,c02eabb7,dc171f54,1de,103,0,1) > at sys____lwp_park60+0x50 > syscall() at syscall+0x89 > --- syscall (number 478) --- > bb3bb4b7: > > bt/a c5223a80 > trace: pid 23268 lid 7 at 0xdb781d0c > sleepq_block(0,1,c0482887,c0495450,c5223a80,6473,c2d42d40,c2d41dc0,c2d29f82,0) > at sleepq_block+0x9b > cv_timedwait_sig(c4dabf2c,c386c3c0,0,c040eb9f,c50a1001,c4eb2f47,65686ee0,c4dabe44,c4dabe44,0) > at cv_timedwait_sig+0xaa > sbwait(c4dabf00,0,db781dbc,c01215cc,0,c50a1080,db781dcc,140eb9f,0,c049a760) > at sbwait+0x57 > soreceive(c4dabe44,0,db781ec8,0,0,0,db781e5c,c040eb9f,2,2) at soreceive+0xc59 > soo_read(c4c932c0,c4c932c0,db781ec8,c4c79000,1,c2d29f80,db781e8c,c02552a5,0,0) > at soo_read+0x3c > do_filereadv(a,b88fff8c,2,c4c932c0,1,db781f7c,3,c5223a80,c5223a80,c5223a80) > at do_filereadv+0x1f0 > sys_readv(c5223a80,db781f54,db781f7c,db781fa0,c02eabb7,db781f54,78,db781f7c,a,b88fff8c) > at sys_readv+0x38 > syscall() at syscall+0x18b > --- syscall (number 120) --- Is this a hang in the read syscall? If it's a hung syscall, getting a statedump of the gluster client process will help. 1. Add "all=yes" to <gluster-install-directory>/var/run/gluster/glusterdump.options 2. kill -SIGUSR1 <gluster-client-pid> 3. 
statedump can be found in <gluster-install-directory>/var/run/gluster/*dump* > bb351877: > > bt/a c542e560 > trace: pid 23268 lid 6 at 0xdc5edddc > sleepq_block(1f5,1,c047363e,c0495dc8,0,3ffff,0,c2d41440,c049cb00,1f5) at > sleepq_block+0xea > kpause(c047363e,1,1f5,0,dc5edea4,c4e5df80,ffffffff,ffffffff,c4da9c00,c3458bb0) > at kpause+0xe8 > nanosleep1(c542e560,3,0,dc5edefc,dc5edf08,9,c38b7360,0,c542e560,c0492efc) at > nanosleep1+0xe5 > sys___nanosleep50(c542e560,dc5edf54,dc5edf7c,c08eb880,c048ce80,dc5edf54,1ae,b9a0e7c0,b8fff730,b8fff73c) > at sys___nanosleep50+0x5f > syscall() at syscall+0x89 > --- syscall (number 430) --- > bb351957: > > > bt/a c5229a80 > trace: pid 23268 lid 5 at 0xdba17e9c > sleepq_block(ea60,1,c047f728,c049a1bc,0,64,dba17efc,c0371aeb,0,c5418d40) at > sleepq_block+0xea > lwp_park(0,1,dba17f18,ba40d1a4,0,257,0,3acd705f,c5229a80,c0494528) at > lwp_park+0x115 > sys____lwp_park60(c5229a80,dba17f54,dba17f7c,dba17fa0,c02eab83,dba17f54,1de,103,0,1) > at sys____lwp_park60+0x50 > syscall() at syscall+0x89 > --- syscall (number 478) --- > bb3bb4b7: > > > bt/a c5418d40 > trace: pid 23268 lid 4 at 0xdb691e9c > sleepq_block(ea60,1,c047f728,c049a1bc,0,db691fa8,db691eec,c0251425,db691ed4,6) > at sleepq_block+0xea > lwp_park(0,1,db691f18,ba40d1a4,0,257,0,3accfb61,c5418d40,c0494528) at > lwp_park+0x115 > sys____lwp_park60(c5418d40,db691f54,db691f7c,db691fa0,c02eab83,db691f54,1de,103,0,1) > at sys____lwp_park60+0x50 > syscall() at syscall+0x89 > --- syscall (number 478) --- > bb3bb4b7: > > bt/a c5346540 > trace: pid 23268 lid 3 at 0xdb727e1c > sleepq_block(0,1,c04716ba,c0495450,c040bc16,0,c2d42d40,c2d41400,c5346540,0) > at sleepq_block+0x9b > cv_timedwait_sig(c53466b4,c5004b80,0,c53466a4,3,db727e90,c53466a4,c41eb528,db727eac,7ff0) > at cv_timedwait_sig+0xaa > sigtimedwait1(c5346540,db727f54,db727f7c,c01026f0,c01026a0,c01026f0,c01026a0,c06d9000,c02ea954,c3469b10) > at sigtimedwait1+0x242 > 
sys_____sigtimedwait50(c5346540,db727f54,db727f7c,db727fa0,c02eab83,db727f54,1af,103,ba1fefbc,0) > at sys_____sigtimedwait50+0x3f > syscall() at syscall+0x89 > --- syscall (number 431) --- > bb39f8c7: > > > > bt/a c4ce22c0 > trace: pid 23268 lid 2 at 0xdcccdddc > sleepq_block(65,1,c047363e,c0495dc8,0,c4eb2f47,e1f297c1,c2d40540,c049c7dc,65) > at sleepq_block+0xea > kpause(c047363e,1,65,0,dcccdea4,dcccdec4,dcccdeac,c0251276,c048cce0,1) at > kpause+0xe8 > nanosleep1(c4ce22c0,3,0,dcccdefc,0,c2d1f548,fffffffd,a5b55001,1b,dcccdfa8) at > nanosleep1+0xe5 > sys___nanosleep50(c4ce22c0,dcccdf54,dcccdf7c,dcccdfa0,c02eab83,dcccdf54,1ae,103,ba3fff98,0) > at sys___nanosleep50+0x5f > syscall() at syscall+0x89 > --- syscall (number 430) --- > bb351957: > > > > bt/a c5418020 > trace: pid 23268 lid 1 at 0xdb721d0c > sleepq_block(2,1,c047fc22,c049a1f8,c56aae94,1,ffffffff,c040eb25,c048ce80,0) > at sleepq_block+0xea > sel_do_scan(30,db721f18,0,db721f7c,c2d42bc2,c4eb2f47,d5d7bdcf,c02369f7,3,3) > at sel_do_scan+0x46e > pollcommon(db721f7c,ba4143c0,6,db721f18,0,db721fa8,db721f2c,db721f40,c040eb9f,0) > at pollcommon+0xe7 > sys_poll(c5418020,db721f54,db721f7c,db721fa0,c02eabb7,db721f54,d1,103,ba4143c0,6) > at sys_poll+0x6a > syscall() at syscall+0x89 > --- syscall (number 209) --- > bb351917: > > bt/a c53692c0 > trace: pid 26515 lid 1 at 0xdbce3d2c > sleepq_block(0,1,c0471305,c0495450,c5066f80,1,c2d42c00,c2d42d80,1,0) at > sleepq_block+0x9b > cv_timedwait_sig(c38eff3c,c38eff10,0,c012f509,c048ce80,0,dbce3dbc,c040eb9f,c31a3560,c048ce80) > at cv_timedwait_sig+0xaa > kevent1(dbce3f7c,d,bb51f080,0,bb51f080,4,0,c044fbb0,c0492ef0,dbce3fa8) at > kevent1+0x45a > sys___kevent50(c53692c0,dbce3f54,dbce3f7c,dbce3fa0,c02eabb7,dbce3f54,1b3,103,d,bb51f080) > at sys___kevent50+0x45 > syscall() at syscall+0x89 > --- syscall (number 435) --- > bb679a77: > > > > > Thanks for the help! 
> > > > > > -- > Emmanuel Dreyfus > manu@xxxxxxxxxx > > > _______________________________________________ > Gluster-devel mailing list > Gluster-devel@xxxxxxxxxxx > http://www.gluster.org/mailman/listinfo/gluster-devel _______________________________________________ Gluster-devel mailing list Gluster-devel@xxxxxxxxxxx http://www.gluster.org/mailman/listinfo/gluster-devel