Hi John, On 4 September 2010 17:35, John Mahoney <jmahoney@xxxxxxxx> wrote: > On Fri, Sep 3, 2010 at 8:07 PM, Vimal <j.vimal@xxxxxxxxx> wrote: >> Hi all, >> >> We're making some modifications to the scheduler, and the kernel >> (2.6.35) just crashes without any error whatsoever. The crash is such >> that the kernel responds to pings for a while, but the mouse doesn't >> work, the screen doesn't refresh, and we're not able to ssh in either. We >> tried investigating using kgdb, but kgdb doesn't show any >> "exceptions/traps" either. >> > > Have you tried using http://en.wikipedia.org/wiki/Magic_SysRq_key > when the system becomes unresponsive? Thanks for the suggestion. We tried it, but the system still didn't respond. :( @Mulyadi Santosa: Thanks for your suggestion! qemu+gdb was really helpful! All the cores were stuck on a spinlock. Attached are the backtraces from the cores that were spinning on the lock. We have contacted the authors and are waiting for a response. Thanks, -- Vimal
(gdb) info threads [New Thread 4] 6 Thread 4 (CPU#3 [halted ]) 0xffffffff81033ce6 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 * 5 Thread 6 (CPU#5 [halted ]) native_safe_halt () at /usr/src/linux-2.6.35/arch/x86/include/asm/irqflags.h:50 4 Thread 3 (CPU#2 [halted ]) 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 3 Thread 5 (CPU#4 [halted ]) 0xffffffff81033ce6 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 2 Thread 2 (CPU#1 [halted ]) 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 1 Thread 1 (CPU#0 [running]) 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 (gdb) thread 6 [Switching to thread 6 (Thread 4)]#0 0xffffffff81033ce6 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 65 asm volatile ( (gdb) bt #0 0xffffffff81033ce6 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 #1 0xffffffff81033d69 in arch_spin_lock (lock=0xffff880001e55740, flags=130) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:738 #2 default_spin_lock_flags (lock=0xffff880001e55740, flags=130) at arch/x86/kernel/paravirt-spinlocks.c:13 #3 0xffffffff8154684f in arch_spin_lock_flags (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:744 #4 do_raw_spin_lock_flags (lock=0xffff880001e55740) at include/linux/spinlock.h:146 #5 __raw_spin_lock_irqsave (lock=0xffff880001e55740) at include/linux/spinlock_api_smp.h:119 #6 _raw_spin_lock_irqsave (lock=0xffff880001e55740) at kernel/spinlock.c:145 #7 0xffffffff81055620 in update_group_shares_cpu (tg=0xffff88017ef30000, data=<value optimized out>) at 
kernel/sched.c:1659 #8 tg_shares_up (tg=0xffff88017ef30000, data=<value optimized out>) at kernel/sched.c:1722 #9 0xffffffff8104869a in walk_tg_tree (down=0xffffffff81044f80 <tg_nop>, up=0xffffffff81055410 <tg_shares_up>, data=0xffff880001ed0420) at kernel/sched.c:1541 #10 0xffffffff810490c2 in update_shares (sd=0xffff880001ed0420) at kernel/sched.c:1765 #11 0xffffffff8104fc00 in select_task_rq_fair (rq=<value optimized out>, p=<value optimized out>, sd_flag=<value optimized out>, wake_flags=<value optimized out>) at kernel/sch #12 0xffffffff8104f1cc in select_task_rq (p=0xffff880194788000, state=1, wake_flags=1) at kernel/sched.c:2385 #13 try_to_wake_up (p=0xffff880194788000, state=1, wake_flags=1) at kernel/sched.c:2469 #14 0xffffffff8104f492 in default_wake_function (curr=<value optimized out>, mode=130, wake_flags=4, key=0x1) at kernel/sched.c:3928 #15 0xffffffff8114ed96 in __pollwake (wait=<value optimized out>, mode=130, sync=4, key=0x1) at fs/select.c:202 #16 pollwake (wait=<value optimized out>, mode=130, sync=4, key=0x1) at fs/select.c:212 #17 0xffffffff81045769 in __wake_up_common (q=<value optimized out>, mode=<value optimized out>, nr_exclusive=<value optimized out>, wake_flags=1, key=0xc1) at kernel/sched.c:3 #18 0xffffffff81048d33 in __wake_up_sync_key (q=0xffff8801944d6a40, mode=1, nr_exclusive=1, key=0xc1) at kernel/sched.c:4020 #19 0xffffffff8144c60e in sock_def_readable (sk=0xffff8801988dda00, len=<value optimized out>) at net/core/sock.c:1857 #20 0xffffffff814e4b6a in unix_dgram_sendmsg (kiocb=<value optimized out>, sock=<value optimized out>, msg=<value optimized out>, len=<value optimized out>) at net/unix/af_unix #21 0xffffffff81447cd3 in __sock_sendmsg (sock=0xffff88019323ef80, msg=0xffff88018f0fde58, size=97) at net/socket.c:573 #22 sock_sendmsg (sock=0xffff88019323ef80, msg=0xffff88018f0fde58, size=97) at net/socket.c:584 #23 0xffffffff81447e85 in sys_sendto (fd=<value optimized out>, buff=0x24976e0, len=97, flags=16384, addr=0x0, 
addr_len=0) at net/socket.c:1677 #24 0xffffffff8100a032 in ?? () at arch/x86/kernel/entry_64.S:487 #25 0x00007f1bc56eb62c in ?? () #26 0xffff88018f233480 in ?? () #27 0x00007f0b71d0b000 in ?? () #28 0x00007f0b71d12000 in ?? () #29 0xffff88018f0fe370 in ?? () #30 0x0000000000000025 in ?? () #31 0x0000000008000075 in ?? () #32 0xffff88018f1d1871 in ?? () #33 0x0000000000000000 in ?? () (gdb) thread 4 [Switching to thread 4 (Thread 3)]#0 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 65 asm volatile ( (gdb) bt #0 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 #1 0xffffffff81033d69 in arch_spin_lock (lock=0xffff880001e55740, flags=130) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:738 #2 default_spin_lock_flags (lock=0xffff880001e55740, flags=130) at arch/x86/kernel/paravirt-spinlocks.c:13 #3 0xffffffff8154684f in arch_spin_lock_flags (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:744 #4 do_raw_spin_lock_flags (lock=0xffff880001e55740) at include/linux/spinlock.h:146 #5 __raw_spin_lock_irqsave (lock=0xffff880001e55740) at include/linux/spinlock_api_smp.h:119 #6 _raw_spin_lock_irqsave (lock=0xffff880001e55740) at kernel/spinlock.c:145 #7 0xffffffff81055620 in update_group_shares_cpu (tg=0xffff88017ef30000, data=<value optimized out>) at kernel/sched.c:1659 #8 tg_shares_up (tg=0xffff88017ef30000, data=<value optimized out>) at kernel/sched.c:1722 #9 0xffffffff8104869a in walk_tg_tree (down=0xffffffff81044f80 <tg_nop>, up=0xffffffff81055410 <tg_shares_up>, data=0xffff880001e90420) at kernel/sched.c:1541 #10 0xffffffff810490c2 in update_shares (sd=0xffff880001e90420) at kernel/sched.c:1765 #11 0xffffffff81051b28 in load_balance (this_cpu=32048160, this_rq=0x82, sd=<value optimized out>, idle=CPU_NOT_IDLE, balance=0x400) at kernel/sched_fair.c:3077 #12 0xffffffff8154433f 
in idle_balance () at kernel/sched_fair.c:3245 #13 schedule () at kernel/sched.c:3766 #14 0xffffffff81546e76 in ?? () at arch/x86/kernel/entry_64.S:908 #15 0xffffffffffffff10 in ?? () Backtrace stopped: previous frame inner to this frame (corrupt stack?) (gdb) thread 2 [Switching to thread 2 (Thread 2)]#0 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 65 asm volatile ( (gdb) bt #0 0xffffffff81033ce4 in __ticket_spin_lock (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/spinlock.h:65 #1 0xffffffff81033d69 in arch_spin_lock (lock=0xffff880001e55740, flags=134) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:738 #2 default_spin_lock_flags (lock=0xffff880001e55740, flags=134) at arch/x86/kernel/paravirt-spinlocks.c:13 #3 0xffffffff8154684f in arch_spin_lock_flags (lock=0xffff880001e55740) at /usr/src/linux-2.6.35/arch/x86/include/asm/paravirt.h:744 #4 do_raw_spin_lock_flags (lock=0xffff880001e55740) at include/linux/spinlock.h:146 #5 __raw_spin_lock_irqsave (lock=0xffff880001e55740) at include/linux/spinlock_api_smp.h:119 #6 _raw_spin_lock_irqsave (lock=0xffff880001e55740) at kernel/spinlock.c:145 #7 0xffffffff81055620 in update_group_shares_cpu (tg=0xffff88017ef30000, data=<value optimized out>) at kernel/sched.c:1659 #8 tg_shares_up (tg=0xffff88017ef30000, data=<value optimized out>) at kernel/sched.c:1722 #9 0xffffffff8104869a in walk_tg_tree (down=0xffffffff81044f80 <tg_nop>, up=0xffffffff81055410 <tg_shares_up>, data=0xffff880001e50420) at kernel/sched.c:1541 #10 0xffffffff810490c2 in update_shares (sd=0xffff880001e50420) at kernel/sched.c:1765 #11 0xffffffff8104fc00 in select_task_rq_fair (rq=<value optimized out>, p=<value optimized out>, sd_flag=<value optimized out>, wake_flags=<value optimized out>) at kernel/sch #12 0xffffffff8104f1cc in select_task_rq (p=0xffff88019478db40, state=1, wake_flags=0) at kernel/sched.c:2385 #13 try_to_wake_up 
(p=0xffff88019478db40, state=1, wake_flags=0) at kernel/sched.c:2469 #14 0xffffffff8104f492 in default_wake_function (curr=<value optimized out>, mode=134, wake_flags=4, key=0x1) at kernel/sched.c:3928 #15 0xffffffff8107bac6 in autoremove_wake_function (wait=0xffff880001e55740, mode=134, sync=4, key=0x1) at kernel/wait.c:165 #16 0xffffffff81045769 in __wake_up_common (q=<value optimized out>, mode=<value optimized out>, nr_exclusive=<value optimized out>, wake_flags=0, key=0x0) at kernel/sched.c:39 #17 0xffffffff81048dc8 in __wake_up (q=0xffffffff81a3b3a0, mode=1, nr_exclusive=1, key=0x0) at kernel/sched.c:3971 #18 0xffffffff8105ddd7 in printk_tick () at kernel/printk.c:1031 #19 0xffffffff8106cc1d in update_process_times (user_tick=0) at kernel/timer.c:1266 #20 0xffffffff8108af31 in tick_sched_timer (timer=0xffff880001e50a60) at kernel/time/tick-sched.c:767 #21 0xffffffff8107f62f in __run_hrtimer (timer=0xffff880001e50a60, now=0xffff880001e43f48) at kernel/hrtimer.c:1227 #22 0xffffffff8107f9d6 in hrtimer_interrupt (dev=<value optimized out>) at kernel/hrtimer.c:1310 #23 0xffffffff8154deab in local_apic_timer_interrupt (regs=<value optimized out>) at arch/x86/kernel/apic/apic.c:799 #24 smp_apic_timer_interrupt (regs=<value optimized out>) at arch/x86/kernel/apic/apic.c:826 #25 0xffffffff8100a9d3 in ?? () at arch/x86/kernel/entry_64.S:978 #26 0x00cf9b000000ffff in ?? () Backtrace stopped: previous frame inner to this frame (corrupt stack?)