[Bug 202237] New: softlockup in vmx_vcpu_run

bugzilla-daemon@xxxxxxxxxxxxxxxxxxx · Sat, 12 Jan 2019 01:30:27 +0000

https://bugzilla.kernel.org/show_bug.cgi?id=202237

            Bug ID: 202237
           Summary: softlockup in vmx_vcpu_run
           Product: Virtualization
           Version: unspecified
    Kernel Version: 4.1.0
          Hardware: Intel
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: high
          Priority: P1
         Component: kvm
          Assignee: virtualization_kvm@xxxxxxxxxxxxxxxxxxxx
          Reporter: lw8186@xxxxxx
        Regression: No

While running Linux with kernel 4.1.0, a soft lockup appears, and after a few
minutes the system recovers. At the same time, we were able to observe that
the qemu process hung for a few minutes. The stack of the soft lockup is as
follows (from the vmcore captured after the problem recurred):
PID: 6618   TASK: ffff883feb79b250  CPU: 2   COMMAND: "CPU 3/KVM"
 #0 [ffff881fff883c90] machine_kexec at ffffffff8105b311
 #1 [ffff881fff883d00] crash_kexec at ffffffff8110c358
 #2 [ffff881fff883dd0] panic at ffffffff817eb056
 #3 [ffff881fff883e50] watchdog_timer_fn at ffffffff81138509
 #4 [ffff881fff883e90] __run_hrtimer at ffffffff810ec4e5
 #5 [ffff881fff883ef0] hrtimer_interrupt at ffffffff810eca83
 #6 [ffff881fff883f70] local_apic_timer_interrupt at ffffffff81052599
 #7 [ffff881fff883f90] smp_apic_timer_interrupt at ffffffff817fafe1
 #8 [ffff881fff883fb0] apic_timer_interrupt at ffffffff817f903e
--- <IRQ stack> ---
 #9 [ffff883fc1833a08] apic_timer_interrupt at ffffffff817f903e
    [exception RIP: smp_call_function_single+205]
    RIP: ffffffff8110125d  RSP: ffff883fc1833ab8  RFLAGS: 00000202
    RAX: 0000000000000000  RBX: 000000000000001a  RCX: 0000000000000003
    RDX: 0000000000000003  RSI: 0000000000000000  RDI: 0000000000000292
    RBP: ffff883fc1833af8   R8: 0000000000000000   R9: 0000000000000004
    R10: 0000000000000004  R11: 0000000000000000  R12: 0000000000000002
    R13: 00000100000000fb  R14: 000000000000e240  R15: 00000002c1833a68
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#10 [ffff883fc1833b00] vmx_vcpu_load at ffffffffc0395485 [kvm_intel]
#11 [ffff883fc1833b40] kvm_arch_vcpu_load at ffffffffc04b5ab3 [kvm]
#12 [ffff883fc1833b60] kvm_sched_in at ffffffffc049f532 [kvm]
#13 [ffff883fc1833b80] finish_task_switch at ffffffff810a52b0
#14 [ffff883fc1833bc0] __schedule at ffffffff817f3b29
#15 [ffff883fc1833c20] schedule at ffffffff817f4287
#16 [ffff883fc1833c40] kvm_vcpu_block at ffffffffc04a045b [kvm]
#17 [ffff883fc1833cb0] vcpu_run at ffffffffc04bb2c7 [kvm]
#18 [ffff883fc1833d80] kvm_arch_vcpu_ioctl_run at ffffffffc04bc455 [kvm]
#19 [ffff883fc1833dc0] kvm_vcpu_ioctl at ffffffffc04a2ab5 [kvm]
#20 [ffff883fc1833e80] do_vfs_ioctl at ffffffff81215f16
#21 [ffff883fc1833f00] sys_ioctl at ffffffff81216451
#22 [ffff883fc1833f50] system_call_fastpath at ffffffff817f81b2
    RIP: 00007fb833444a57  RSP: 00007fb8293a79f8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000055e7fee9db50  RCX: 00007fb833444a57
    RDX: 0000000000000000  RSI: 000000000000ae80  RDI: 0000000000000016
    RBP: 00000000c008ae67   R8: 000055e7fd4c52f0   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000246  R12: 00007fb8293a7820
    R13: 0000000000000002  R14: 0000000000000003  R15: 00000000000001f0
    ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b

From the vmcore, the CPU targeted by smp_call_function_single can be found as follows:
   struct kvm_vcpu ffff883fc1838000
   crash> struct vcpu_vmx.loaded_vmcs ffff883fc1838000
        loaded_vmcs = 0xffff883fc183bb20
   crash> struct loaded_vmcs.cpu 0xffff883fc183bb20
        cpu = 26

There were several call_single_data entries (from IPIs) queued on CPU 26.
   crash> p call_single_queue
        [26]: ffff881fffb984c0
        crash> struct llist_head ffff881fffb984c0
                struct llist_head {
                        first = 0xffff881f9bb6fac8
                }
        crash> list 0xffff881f9bb6fac8
                        ffff881f9bb6fac8
                        ffff883fcfc1fac8
                        ffff883fc1807ac8
                        ffff883fd2afbac8
                        ffff883fcc7bbac8
                        ffff881fa51f7ac8
                        ffff881fa18b3ac8
                        ffff883e7582bac8
                        ffff881faa86bac8
                        ffff881fa6cc3ac8
                        ffff881f9db23ac8
                        ffff883fbbbffac8
                        ffff883fed807ac8
                        ffff881fb30efac8
                        ffff881fed4f3ac8
                        ffff883fc1833ac8  ======= This is from cpu2

The current task is:
    crash> bt 6178
PID: 6178   TASK: ffff883fee801420  CPU: 26  COMMAND: "CPU 1/KVM"
 #0 [ffff881fffb86e00] crash_nmi_callback at ffffffff8104f2d8
 #1 [ffff881fffb86e10] nmi_handle at ffffffff8101af87
 #2 [ffff881fffb86e90] default_do_nmi at ffffffff8101b7cd
 #3 [ffff881fffb86ec0] do_nmi at ffffffff8101b9b5
 #4 [ffff881fffb86ef0] end_repeat_nmi at ffffffff817fa505
    [exception RIP: vmx_vcpu_run+1699]
    RIP: ffffffffc0396883  RSP: ffff883fcffdfc48  RFLAGS: 00000046
    RAX: 0000000080000202  RBX: 0000000080000200  RCX: ffff883fcca80000
    RDX: 0000000000004404  RSI: 00000000f772f120  RDI: ffff883fcca80000
    RBP: ffff883fcffdfca8   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #5 [ffff883fcffdfc48] vmx_vcpu_run at ffffffffc0396883 [kvm_intel]
 #6 [ffff883fcffdfc78] vmx_handle_external_intr at ffffffffc03906ac [kvm_intel]
 #7 [ffff883fcffdfcb0] vcpu_run at ffffffffc04bbb5b [kvm]
 #8 [ffff883fcffdfd80] kvm_arch_vcpu_ioctl_run at ffffffffc04bc455 [kvm]
 #9 [ffff883fcffdfdc0] kvm_vcpu_ioctl at ffffffffc04a2ab5 [kvm]
#10 [ffff883fcffdfe80] do_vfs_ioctl at ffffffff81215f16
#11 [ffff883fcffdff00] sys_ioctl at ffffffff81216451
#12 [ffff883fcffdff50] system_call_fastpath at ffffffff817f81b2
    RIP: 00007f6d4af16a57  RSP: 00007f6d41e7b9f8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000055cf6e3b9b50  RCX: 00007f6d4af16a57
    RDX: 0000000000000000  RSI: 000000000000ae80  RDI: 0000000000000014
    RBP: 00000000c008ae67   R8: 000055cf6d5cb2f0   R9: 27ec1e2200000000
    R10: 0000000000000000  R11: 0000000000000246  R12: 00007f6d41e7b7f0
    R13: 0000000000000002  R14: 0000000000000001  R15: 0000000000000600
    ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b

From vcpu_run, we can see that vmx_vcpu_run is called with interrupts
disabled; therefore it cannot respond normally to the IPI.

Is there a problem with my understanding?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.