Hi all, I noticed an xhci warning on a Stratus fault-tolerant box running automated surprise device removal tests over the weekend: irq event 95: bogus return value ffffff94 CPU: 0 PID: 31710 Comm: kworker/u97:2 Tainted: PF O-------------- 3.10.0-229.el7.x86_64 #1 Hardware name: Stratus ftServer 6800/G7LYY, BIOS BIOS Version 8.0:38 03/30/2015 Workqueue: events_unbound async_run_entry_fn ffff88037dd8ada0 000000009c52d75c ffff88085fa03cd8 ffffffff81603f36 ffff88085fa03d00 ffffffff8110d852 ffff88037dd8ada0 000000000000005f 00000000ffffff94 ffff88085fa03d40 ffffffff8110dc1c 000000009c52d75c Call Trace: <IRQ> [<ffffffff81603f36>] dump_stack+0x19/0x1b [<ffffffff8110d852>] __report_bad_irq+0x32/0xd0 [<ffffffff8110dc1c>] note_interrupt+0xdc/0x1f0 [<ffffffff8110b381>] handle_irq_event_percpu+0xe1/0x1e0 [<ffffffff8110b4bd>] handle_irq_event+0x3d/0x60 [<ffffffff8110e157>] handle_edge_irq+0x77/0x130 [<ffffffff81015c9f>] handle_irq+0xbf/0x150 [<ffffffff81077d27>] ? irq_enter+0x17/0xa0 [<ffffffff8161626f>] do_IRQ+0x4f/0xf0 [<ffffffff8160b4ed>] common_interrupt+0x6d/0x6d [<ffffffff810abe65>] ? sched_clock_cpu+0xb5/0x100 [<ffffffff81077adb>] ? __do_softirq+0x9b/0x280 [<ffffffff816156dc>] call_softirq+0x1c/0x30 [<ffffffff81015d95>] do_softirq+0x65/0xa0 [<ffffffff81077ec5>] irq_exit+0x115/0x120 [<ffffffff81616355>] smp_apic_timer_interrupt+0x45/0x60 [<ffffffff81614a1d>] apic_timer_interrupt+0x6d/0x80 <EOI> [<ffffffff8107089f>] ? vprintk_emit+0x1bf/0x530 [<ffffffff815fd9ef>] printk+0x77/0x8e [<ffffffffa0390340>] ? _scsih_suspend+0xb0/0xb0 [mpt3sas] [<ffffffffa038b144>] mpt3sas_port_enable+0x24/0x100 [mpt3sas] [<ffffffffa039038f>] _scsih_scan_start+0x4f/0x70 [mpt3sas] [<ffffffff813fc6c7>] do_scsi_scan_host+0x37/0xa0 [<ffffffff813fc8fc>] do_scan_async+0x1c/0x150 [<ffffffff8109e839>] async_run_entry_fn+0x39/0x120 [<ffffffff8108f0ab>] process_one_work+0x17b/0x470 [<ffffffff8108fe8b>] worker_thread+0x11b/0x400 [<ffffffff8108fd70>] ? 
rescuer_thread+0x400/0x400 [<ffffffff8109726f>] kthread+0xcf/0xe0 [<ffffffff810971a0>] ? kthread_create_on_node+0x140/0x140 [<ffffffff81613cfc>] ret_from_fork+0x7c/0xb0 [<ffffffff810971a0>] ? kthread_create_on_node+0x140/0x140 handlers: [<ffffffff81458c70>] xhci_msi_irq drivers/usb/host/xhci-ring.c : irqreturn_t xhci_irq(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); u32 status; u64 temp_64; union xhci_trb *event_ring_deq; dma_addr_t deq; spin_lock(&xhci->lock); /* Check if the xHC generated the interrupt, or the irq is shared */ status = readl(&xhci->op_regs->status); if (status == 0xffffffff) goto hw_died; if (!(status & STS_EINT)) { spin_unlock(&xhci->lock); return IRQ_NONE; } if (status & STS_FATAL) { xhci_warn(xhci, "WARNING: Host System Error\n"); xhci_halt(xhci); hw_died: spin_unlock(&xhci->lock); return -ESHUTDOWN; } ... So -ESHUTDOWN = -108 (0xffffff94) provoked bad_action_ret into reporting the bogus return value and stack trace above. This error message and stack trace have been reported in the past [1], and I was wondering about the value of the stack trace (at least in the hw_died case). A quick survey of the other host controller drivers that bother checking for 0xffffffff shows they all do something like this: if (status == ~(u32) 0) { ehci_dbg (ehci, "device removed\n"); goto dead; } ... dead: ... return IRQ_HANDLED; but xhci-ring.c looks to be the only one to return !IRQ_XXX from its irq handler. Granted, most folks aren't hotplugging USB host controllers, but this is supported on Stratus HW. Are there any opinions on the following patch? 
Regards, -- Joe [1] https://bugzilla.redhat.com/show_bug.cgi?id=692425 --->8-- -->8-- -->8-- >From ff69f1bb5601ce5f0e70bb2e97c65456e13dc38e Mon Sep 17 00:00:00 2001 From: Joe Lawrence <joe.lawrence@xxxxxxxxxxx> Date: Mon, 20 Apr 2015 11:14:47 -0400 Subject: [PATCH] xhci: gracefully handle xhci_irq dead device If the xHCI host controller has died (ie, device removed), then xhci_irq should quietly handle this condition with IRQ_HANDLED. Signed-off-by: Joe Lawrence <joe.lawrence@xxxxxxxxxxx> --- drivers/usb/host/xhci-ring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index f5397a517c54..42bd3e2670bf 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2626,11 +2626,13 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) dma_addr_t deq; spin_lock(&xhci->lock); - /* Check if the xHC generated the interrupt, or the irq is shared */ status = readl(&xhci->op_regs->status); - if (status == 0xffffffff) - goto hw_died; + if (status == 0xffffffff) { + spin_unlock(&xhci->lock); + return IRQ_HANDLED; + } + /* Check if the xHC generated the interrupt, or the irq is shared */ if (!(status & STS_EINT)) { spin_unlock(&xhci->lock); return IRQ_NONE; @@ -2638,7 +2640,6 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) if (status & STS_FATAL) { xhci_warn(xhci, "WARNING: Host System Error\n"); xhci_halt(xhci); -hw_died: spin_unlock(&xhci->lock); return -ESHUTDOWN; } -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html