On Tue, 16 Sep 2014 11:56:14 -0400 Alan Stern <stern@xxxxxxxxxxxxxxxxxxx> wrote: > On Tue, 16 Sep 2014, Joe Lawrence wrote: > > > > Anyway, the log above means that a QH was linked before the HC died, > > > but then it was never unlinked. Please add a line at the start of > > > ehci_endpoint_disable, printing out the value of ep->hcpriv for > > > comparison with the values listed above. > > > > Will do. > > > > > Also, let's see the contents of *qh and *(qh->hw) for the left-over QH > > > at *(ehci->async->qh_next.qh). You can get those right away from your > > > crash dump data, right? > > > > Yep, interestingly enough, *(ehci->async->qh_next.qh) appears to be > > mostly full of zeros: > > > crash> struct ehci_qh 0xffff88083d6b4558 << *(ehci->async->qh_next.qh) > > struct ehci_qh { > > hw = 0x0, > > qh_dma = 0x0, > > qh_next = { > > qh = 0x0, > > itd = 0x0, > > sitd = 0x0, > > fstn = 0x0, > > hw_next = 0x0, > > ptr = 0x0 > > }, > > qtd_list = { > > next = 0x0, > > prev = 0x0 > > }, > > intr_node = { > > next = 0x0, > > prev = 0x0 > > }, > > dummy = 0x0, > > unlink_node = { > > next = 0x0, > > prev = 0x0 > > }, > > unlink_cycle = 0x6, > > qh_state = 0x0, > > xacterrs = 0x0, > > usecs = 0x0, > > gap_uf = 0x0, > > c_usecs = 0x0, > > tt_usecs = 0x0, > > period = 0x0, > > start = 0x0, > > dev = 0x6, > > is_out = 0x0, > > clearing_tt = 0x0, > > dequeue_during_giveback = 0x0, > > exception = 0x0 > > } > > So this QH was destroyed properly, but somehow the link pointer in > ehci->async continued to point at it. And it got destroyed without > passing through single_unlink_async. > > Maybe you should also add a check at the end of ehci_endpoint_disable, > just before the done: label. For example, > > BUG_ON(ehci->async->qh_next.qh == qh); > > Then if you really want to be thorough, in single_unlink_async, just > after the "while" loop, print the values of prev, prev->qh_next.qh, qh, > and qh->qh_next.qh. And in qh_link_async, just after the wmb(), print > the values of head->qh_next.qh and qh. (This could end up generating > quite a lot of output...) > > Alan Stern > Quick turn around on this one :) crash> log | grep 0000:2c:00.0 pci 0000:2c:00.0: [8086:1d26] type 00 class 0x0c0320 pci 0000:2c:00.0: reg 0x10: [mem 0x90000000-0x900003ff] pci 0000:2c:00.0: PME# supported from D0 D3hot D3cold ehci-pci 0000:2c:00.0: EHCI Host Controller ehci-pci 0000:2c:00.0: new USB bus registered, assigned bus number 1 ehci-pci 0000:2c:00.0: debug port 2 ehci-pci 0000:2c:00.0: cache line size of 64 is not supported ehci-pci 0000:2c:00.0: irq 10, io mem 0x90000000 ehci-pci 0000:2c:00.0: USB 2.0 started, EHCI 1.00 usb usb1: SerialNumber: 0000:2c:00.0 ehci-pci 0000:2c:00.0: qh_link_async:1003 ehci = ffff88084ff1b088, head->qh_next.qh = (null), qh = ffff88083eacae50 ehci-pci 0000:2c:00.0: HC died; cleaning up ehci-pci 0000:2c:00.0: ehci_endpoint_disable:944 ep->hcpriv = ffff88083eacae50, ehci(ffff88084ff1b088)->num_async = 1, ehci->async->qh_next.qh = ffff88083eacae50 ------------[ cut here ]------------ kernel BUG at drivers/usb/host/ehci-hcd.c:1002! invalid opcode: 0000 [#1] SMP ... CPU: 0 PID: 207 Comm: khubd Tainted: PF O-------------- 3.10.0-123.6.3.el7.bigphysarea_expedient1_usb4.x86_64 #1 Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013 task: ffff880853ff8cb0 ti: ffff880853434000 task.ti: ffff880853434000 RIP: 0010:[<ffffffff814157c6>] [<ffffffff814157c6>] ehci_endpoint_disable+0x1f6/0x200 RSP: 0018:ffff880853435ca0 EFLAGS: 00010046 RAX: ffff8800372cc720 RBX: ffff88083eacae50 RCX: dead000000200200 RDX: ffff88085f800710 RSI: 0000000000000047 RDI: 0000000000000047 RBP: ffff880853435ce0 R08: ffff88085f800710 R09: 0000000100230022 R10: ffff88085f807600 R11: ffffffff81412158 R12: ffff88084ff1ae68 R13: ffff88084ff1b174 R14: ffff880851914a88 R15: 0000000000000246 FS: 0000000000000000(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffc501a2000 CR3: 0000000850402000 CR4: 00000000000407f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff880800000001 ffff88083eacae50 ffff88084ff1b088 ffff880851914a40 ffff880851914a88 0000000000000001 ffff880851914a40 00000000ffffff00 ffff880853435d00 ffffffff813fabf3 ffff880851914a40 ffff880851914a88 Call Trace: [<ffffffff813fabf3>] usb_hcd_disable_endpoint+0x33/0x40 [<ffffffff813fd8b9>] usb_disable_endpoint+0x69/0x90 [<ffffffff813effbc>] usb_ep0_reinit+0x1c/0x40 [<ffffffff813f4fba>] hub_port_connect_change+0x33a/0xb50 [<ffffffff813f5b49>] hub_events+0x379/0xa30 [<ffffffff813f6245>] hub_thread+0x45/0x1a0 [<ffffffff81086ac0>] ? wake_up_bit+0x30/0x30 [<ffffffff813f6200>] ? hub_events+0xa30/0xa30 [<ffffffff81085aff>] kthread+0xcf/0xe0 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140 [<ffffffff815f31ec>] ret_from_fork+0x7c/0xb0 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140 Just to review, I pasted the USB related patches to this kernel below. The drivers/usb/core/hub.c changes include: d8521af "usb: assign default peer ports for root hubs" (maxchild parts) c605f3c "usb: hub: take hub->hdev reference when processing from eventlist" the rest are debugging for the qh_destroy BUG. -- Joe diff -Nupr before/drivers/usb/core/hub.c after/drivers/usb/core/hub.c --- before/drivers/usb/core/hub.c 2014-07-16 14:25:31.000000000 -0400 +++ after/drivers/usb/core/hub.c 2014-09-16 12:35:26.816249313 -0400 @@ -1304,6 +1304,7 @@ static int hub_configure(struct usb_hub char *message = "out of memory"; unsigned unit_load; unsigned full_load; + unsigned maxchild; hub->buffer = kmalloc(sizeof(*hub->buffer), GFP_KERNEL); if (!hub->buffer) { @@ -1342,12 +1343,11 @@ static int hub_configure(struct usb_hub goto fail; } - hdev->maxchild = hub->descriptor->bNbrPorts; - dev_info (hub_dev, "%d port%s detected\n", hdev->maxchild, - (hdev->maxchild == 1) ? "" : "s"); + maxchild = hub->descriptor->bNbrPorts; + dev_info (hub_dev, "%d port%s detected\n", maxchild, + (maxchild == 1) ? "" : "s"); - hub->ports = kzalloc(hdev->maxchild * sizeof(struct usb_port *), - GFP_KERNEL); + hub->ports = kzalloc(maxchild * sizeof(struct usb_port *), GFP_KERNEL); if (!hub->ports) { ret = -ENOMEM; goto fail; @@ -1368,11 +1368,11 @@ static int hub_configure(struct usb_hub int i; char portstr [USB_MAXCHILDREN + 1]; - for (i = 0; i < hdev->maxchild; i++) + for (i = 0; i < maxchild; i++) portstr[i] = hub->descriptor->u.hs.DeviceRemovable [((i + 1) / 8)] & (1 << ((i + 1) % 8)) ? 'F' : 'R'; - portstr[hdev->maxchild] = 0; + portstr[maxchild] = 0; dev_dbg(hub_dev, "compound device; port removable status: %s\n", portstr); } else dev_dbg(hub_dev, "standalone hub\n"); @@ -1484,7 +1484,7 @@ static int hub_configure(struct usb_hub if (hcd->power_budget > 0) hdev->bus_mA = hcd->power_budget; else - hdev->bus_mA = full_load * hdev->maxchild; + hdev->bus_mA = full_load * maxchild; if (hdev->bus_mA >= full_load) hub->mA_per_port = full_load; else { @@ -1499,7 +1499,7 @@ static int hub_configure(struct usb_hub hub->descriptor->bHubContrCurrent); hub->limited_power = 1; - if (remaining < hdev->maxchild * unit_load) + if (remaining < maxchild * unit_load) dev_warn(hub_dev, "insufficient power available " "to use all downstream ports\n"); @@ -1567,15 +1567,17 @@ static int hub_configure(struct usb_hub if (hub->has_indicators && blinkenlights) hub->indicator [0] = INDICATOR_CYCLE; - for (i = 0; i < hdev->maxchild; i++) { + for (i = 0; i < maxchild; i++) { ret = usb_hub_create_port_device(hub, i + 1); if (ret < 0) { dev_err(hub->intfdev, "couldn't create port%d device.\n", i + 1); - hdev->maxchild = i; - goto fail_keep_maxchild; + break; } } + hdev->maxchild = i; + if (ret < 0) + goto fail; usb_hub_adjust_deviceremovable(hdev, hub->descriptor); @@ -1583,8 +1585,6 @@ static int hub_configure(struct usb_hub return 0; fail: - hdev->maxchild = 0; -fail_keep_maxchild: dev_err (hub_dev, "config failed, %s (err %d)\n", message, ret); /* hub_disconnect() frees urb and descriptor */ @@ -4650,9 +4650,10 @@ static void hub_events(void) hub = list_entry(tmp, struct usb_hub, event_list); kref_get(&hub->kref); + hdev = hub->hdev; + usb_get_dev(hdev); spin_unlock_irq(&hub_event_lock); - hdev = hub->hdev; hub_dev = hub->intfdev; intf = to_usb_interface(hub_dev); dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n", @@ -4864,6 +4865,7 @@ static void hub_events(void) usb_autopm_put_interface(intf); loop_disconnected: usb_unlock_device(hdev); + usb_put_dev(hdev); kref_put(&hub->kref, hub_release); } /* end while (1) */ diff -Nupr before/drivers/usb/host/ehci.h after/drivers/usb/host/ehci.h --- before/drivers/usb/host/ehci.h 2014-07-16 14:25:31.000000000 -0400 +++ after/drivers/usb/host/ehci.h 2014-09-16 12:35:26.836249471 -0400 @@ -229,6 +229,8 @@ struct ehci_hcd { /* one per controlle struct dentry *debug_dir; #endif + int num_async; + /* platform-specific data -- must come last */ unsigned long priv[0] __aligned(sizeof(s64)); }; diff -Nupr before/drivers/usb/host/ehci-hcd.c after/drivers/usb/host/ehci-hcd.c --- before/drivers/usb/host/ehci-hcd.c 2014-07-16 14:25:31.000000000 -0400 +++ after/drivers/usb/host/ehci-hcd.c 2014-09-16 12:42:24.603336165 -0400 @@ -417,6 +417,7 @@ static void ehci_stop (struct usb_hcd *h ehci_dbg (ehci, "stop\n"); + ehci_info(ehci, "%s:%d ehci(%p)->num_async = %d, ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ehci, ehci->num_async, ehci->async->qh_next.qh); /* no more interrupts ... */ spin_lock_irq(&ehci->lock); @@ -940,6 +941,8 @@ ehci_endpoint_disable (struct usb_hcd *h /* ASSERT: any requests/urbs are being unlinked */ /* ASSERT: nobody can be submitting urbs for this any more */ + ehci_info(ehci, "%s:%d ep->hcpriv = %p, ehci(%p)->num_async = %d, ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ep->hcpriv, ehci, ehci->num_async, ehci->async->qh_next.qh); + rescan: spin_lock_irqsave (&ehci->lock, flags); qh = ep->hcpriv; @@ -995,6 +998,9 @@ idle_timeout: list_empty (&qh->qtd_list) ? "" : "(has tds)"); break; } + + BUG_ON(ehci->async->qh_next.qh == qh); + done: ep->hcpriv = NULL; spin_unlock_irqrestore (&ehci->lock, flags); diff -Nupr before/drivers/usb/host/ehci-q.c after/drivers/usb/host/ehci-q.c --- before/drivers/usb/host/ehci-q.c 2014-07-16 14:25:31.000000000 -0400 +++ after/drivers/usb/host/ehci-q.c 2014-09-16 12:42:21.553284025 -0400 @@ -959,6 +959,25 @@ static void disable_async(struct ehci_hc ehci_poll_ASS(ehci); } +static void check_async_ring(struct ehci_hcd *ehci, int add) +{ + struct ehci_qh *qh; + int n; + + qh = ehci->async->qh_next.qh; + n = ehci->num_async += add; + + while (qh && n > 0) { + qh = qh->qh_next.qh; + --n; + } + if (qh || n != 0) { + ehci_err(ehci, "EHCI async list corrupted: num %d n %d qh %p\n", + ehci->num_async, n, qh); + BUG(); + } +} + /* move qh (and its qtds) onto async queue; maybe enable queue. */ static void qh_link_async (struct ehci_hcd *ehci, struct ehci_qh *qh) @@ -981,6 +1000,8 @@ static void qh_link_async (struct ehci_h qh->hw->hw_next = head->hw->hw_next; wmb (); + ehci_info(ehci, "%s:%d ehci = %p, head->qh_next.qh = %p, qh = %p\n", __func__, __LINE__, ehci, head->qh_next.qh, qh); + head->qh_next.qh = qh; head->hw->hw_next = dma; @@ -990,6 +1011,8 @@ static void qh_link_async (struct ehci_h /* qtd completions reported later by interrupt */ enable_async(ehci); + + check_async_ring(ehci, 1); } /*-------------------------------------------------------------------------*/ @@ -1254,10 +1277,14 @@ static void single_unlink_async(struct e while (prev->qh_next.qh != qh) prev = prev->qh_next.qh; + ehci_info(ehci, "%s:%d ehci = %p prev = %p, prev->qh_next.qh = %p, qh = %p, qh->qh_next.qh = %p\n", __func__, __LINE__, ehci, prev, prev->qh_next.qh, qh, qh->qh_next.qh); + prev->hw->hw_next = qh->hw->hw_next; prev->qh_next = qh->qh_next; if (ehci->qh_scan_next == qh) ehci->qh_scan_next = qh->qh_next.qh; + + check_async_ring(ehci, -1); } static void start_iaa_cycle(struct ehci_hcd *ehci) -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html