Re: quota.t hangs on NetBSD machines

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On Thu, Dec 31, 2015 at 3:24 PM, Emmanuel Dreyfus <manu@xxxxxxxxxx> wrote:
On Thu, Dec 31, 2015 at 02:51:41PM +0530, Raghavendra Talur wrote:
> To our surprise though, the hung test started proceeding.

You mean a process gets stuck in a system call for hours and then
is able to escape?

Some hints:

1) ps -axl shows the wait channel (WCHAN column) for a process stuck
in the kernel. What is it?

# ps -axl | grep 23268
   0 23268     1     0  85  0  56436 12544 select   Isl  ?        0:02.06 glusterfs --attribute-timeout=0 --entry-timeout=0 -s nbsla
# ps -axl | grep 26515
   0 26515     1     0  85  0   4508  1440 kqueue   S+   pts/0    0:00.04 perfused: perfused /mnt/glusterfs/0 (perfused)

 

2) crash is a kernel debugger that can be used while running multiuser.
Of course since the system is running, the output is obsolete most
of the time, but for a stuck process we can extract valuable information.

Run crash from the shell, then inside crash, run the ps command. Find the
relevant process and note the address in the STRUCT LWP * column. For
example, let us say it is c63452a0.

relevant lines:

23268    8 3   0        80           c4c9f000          glusterfsd parked
23268    7 3   0        80           c5223a80         glusterfsd netio
23268    6 3   0        80           c542e560         glusterfsd nanoslp
23268    5 3   1        80           c5229a80         glusterfsd parked
23268    4 3   0        80           c5418d40         glusterfsd parked
23268    3 3   0        80           c5346540         glusterfsd sigwait
23268    2 3   0        80           c4ce22c0         glusterfsd nanoslp
23268    1 3   1        80           c5418020         glusterfsd select
26515    1 3   1        80           c53692c0         perfused kqueue

 

bt/a c63452a0 will produce a kernel backtrace for the process. This can
be extremely valuable to understand what is going on. If we are waiting
for a lock, we can track what process is holding it.

bt/a c4c9f000
trace: pid 23268 lid 8 at 0xdc171e9c
sleepq_block(0,1,c047f728,c049a1bc,6,5000a018,8,dc171f54,c4c9f000,6) at sleepq_block+0x9b
lwp_park(0,1,0,bb1ac150,1de,dc171f54,6,dc171f40,c4c9f000,c0494528) at lwp_park+0x115
sys____lwp_park60(c4c9f000,dc171f54,dc171f7c,dc171fa0,c02eabb7,dc171f54,1de,103,0,1) at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:

bt/a c5223a80
trace: pid 23268 lid 7 at 0xdb781d0c
sleepq_block(0,1,c0482887,c0495450,c5223a80,6473,c2d42d40,c2d41dc0,c2d29f82,0) at sleepq_block+0x9b
cv_timedwait_sig(c4dabf2c,c386c3c0,0,c040eb9f,c50a1001,c4eb2f47,65686ee0,c4dabe44,c4dabe44,0) at cv_timedwait_sig+0xaa
sbwait(c4dabf00,0,db781dbc,c01215cc,0,c50a1080,db781dcc,140eb9f,0,c049a760) at sbwait+0x57
soreceive(c4dabe44,0,db781ec8,0,0,0,db781e5c,c040eb9f,2,2) at soreceive+0xc59
soo_read(c4c932c0,c4c932c0,db781ec8,c4c79000,1,c2d29f80,db781e8c,c02552a5,0,0) at soo_read+0x3c
do_filereadv(a,b88fff8c,2,c4c932c0,1,db781f7c,3,c5223a80,c5223a80,c5223a80) at do_filereadv+0x1f0
sys_readv(c5223a80,db781f54,db781f7c,db781fa0,c02eabb7,db781f54,78,db781f7c,a,b88fff8c) at sys_readv+0x38
syscall() at syscall+0x18b
--- syscall (number 120) ---
bb351877:

bt/a c542e560
trace: pid 23268 lid 6 at 0xdc5edddc
sleepq_block(1f5,1,c047363e,c0495dc8,0,3ffff,0,c2d41440,c049cb00,1f5) at sleepq_block+0xea
kpause(c047363e,1,1f5,0,dc5edea4,c4e5df80,ffffffff,ffffffff,c4da9c00,c3458bb0) at kpause+0xe8
nanosleep1(c542e560,3,0,dc5edefc,dc5edf08,9,c38b7360,0,c542e560,c0492efc) at nanosleep1+0xe5
sys___nanosleep50(c542e560,dc5edf54,dc5edf7c,c08eb880,c048ce80,dc5edf54,1ae,b9a0e7c0,b8fff730,b8fff73c) at sys___nanosleep50+0x5f
syscall() at syscall+0x89
--- syscall (number 430) ---
bb351957:


bt/a c5229a80
trace: pid 23268 lid 5 at 0xdba17e9c
sleepq_block(ea60,1,c047f728,c049a1bc,0,64,dba17efc,c0371aeb,0,c5418d40) at sleepq_block+0xea
lwp_park(0,1,dba17f18,ba40d1a4,0,257,0,3acd705f,c5229a80,c0494528) at lwp_park+0x115
sys____lwp_park60(c5229a80,dba17f54,dba17f7c,dba17fa0,c02eab83,dba17f54,1de,103,0,1) at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:


 bt/a  c5418d40
trace: pid 23268 lid 4 at 0xdb691e9c
sleepq_block(ea60,1,c047f728,c049a1bc,0,db691fa8,db691eec,c0251425,db691ed4,6) at sleepq_block+0xea
lwp_park(0,1,db691f18,ba40d1a4,0,257,0,3accfb61,c5418d40,c0494528) at lwp_park+0x115
sys____lwp_park60(c5418d40,db691f54,db691f7c,db691fa0,c02eab83,db691f54,1de,103,0,1) at sys____lwp_park60+0x50
syscall() at syscall+0x89
--- syscall (number 478) ---
bb3bb4b7:

 bt/a c5346540
trace: pid 23268 lid 3 at 0xdb727e1c
sleepq_block(0,1,c04716ba,c0495450,c040bc16,0,c2d42d40,c2d41400,c5346540,0) at sleepq_block+0x9b
cv_timedwait_sig(c53466b4,c5004b80,0,c53466a4,3,db727e90,c53466a4,c41eb528,db727eac,7ff0) at cv_timedwait_sig+0xaa
sigtimedwait1(c5346540,db727f54,db727f7c,c01026f0,c01026a0,c01026f0,c01026a0,c06d9000,c02ea954,c3469b10) at sigtimedwait1+0x242
sys_____sigtimedwait50(c5346540,db727f54,db727f7c,db727fa0,c02eab83,db727f54,1af,103,ba1fefbc,0) at sys_____sigtimedwait50+0x3f
syscall() at syscall+0x89
--- syscall (number 431) ---
bb39f8c7:



bt/a c4ce22c0
trace: pid 23268 lid 2 at 0xdcccdddc
sleepq_block(65,1,c047363e,c0495dc8,0,c4eb2f47,e1f297c1,c2d40540,c049c7dc,65) at sleepq_block+0xea
kpause(c047363e,1,65,0,dcccdea4,dcccdec4,dcccdeac,c0251276,c048cce0,1) at kpause+0xe8
nanosleep1(c4ce22c0,3,0,dcccdefc,0,c2d1f548,fffffffd,a5b55001,1b,dcccdfa8) at nanosleep1+0xe5
sys___nanosleep50(c4ce22c0,dcccdf54,dcccdf7c,dcccdfa0,c02eab83,dcccdf54,1ae,103,ba3fff98,0) at sys___nanosleep50+0x5f
syscall() at syscall+0x89
--- syscall (number 430) ---
bb351957:



 bt/a c5418020
trace: pid 23268 lid 1 at 0xdb721d0c
sleepq_block(2,1,c047fc22,c049a1f8,c56aae94,1,ffffffff,c040eb25,c048ce80,0) at sleepq_block+0xea
sel_do_scan(30,db721f18,0,db721f7c,c2d42bc2,c4eb2f47,d5d7bdcf,c02369f7,3,3) at sel_do_scan+0x46e
pollcommon(db721f7c,ba4143c0,6,db721f18,0,db721fa8,db721f2c,db721f40,c040eb9f,0) at pollcommon+0xe7
sys_poll(c5418020,db721f54,db721f7c,db721fa0,c02eabb7,db721f54,d1,103,ba4143c0,6) at sys_poll+0x6a
syscall() at syscall+0x89
--- syscall (number 209) ---
bb351917:

bt/a c53692c0
trace: pid 26515 lid 1 at 0xdbce3d2c
sleepq_block(0,1,c0471305,c0495450,c5066f80,1,c2d42c00,c2d42d80,1,0) at sleepq_block+0x9b
cv_timedwait_sig(c38eff3c,c38eff10,0,c012f509,c048ce80,0,dbce3dbc,c040eb9f,c31a3560,c048ce80) at cv_timedwait_sig+0xaa
kevent1(dbce3f7c,d,bb51f080,0,bb51f080,4,0,c044fbb0,c0492ef0,dbce3fa8) at kevent1+0x45a
sys___kevent50(c53692c0,dbce3f54,dbce3f7c,dbce3fa0,c02eabb7,dbce3f54,1b3,103,d,bb51f080) at sys___kevent50+0x45
syscall() at syscall+0x89
--- syscall (number 435) ---
bb679a77:




 Thanks for the help!



--
Emmanuel Dreyfus
manu@xxxxxxxxxx

_______________________________________________
Gluster-devel mailing list
Gluster-devel@xxxxxxxxxxx
http://www.gluster.org/mailman/listinfo/gluster-devel

[Index of Archives]     [Gluster Users]     [Ceph Users]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux