Re: Hitting "unused qh not empty" BUG in qh_destroy

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 16 Sep 2014 11:56:14 -0400
Alan Stern <stern@xxxxxxxxxxxxxxxxxxx> wrote:

> On Tue, 16 Sep 2014, Joe Lawrence wrote:
> 
> > > Anyway, the log above means that a QH was linked before the HC died,
> > > but then it was never unlinked.  Please add a line at the start of
> > > ehci_endpoint_disable, printing out the value of ep->hcpriv for
> > > comparison with the values listed above.
> > 
> > Will do.
> > 
> > > Also, let's see the contents of *qh and *(qh->hw) for the left-over QH
> > > at *(ehci->async->qh_next.qh).  You can get those right away from your 
> > > crash dump data, right?
> > 
> > Yep, interestingly enough, *(ehci->async->qh_next.qh) appears to be
> > mostly full of zeros:
> 
> > crash> struct ehci_qh 0xffff88083d6b4558         << *(ehci->async->qh_next.qh)
> > struct ehci_qh {
> >   hw = 0x0,
> >   qh_dma = 0x0, 
> >   qh_next = {
> >     qh = 0x0, 
> >     itd = 0x0, 
> >     sitd = 0x0, 
> >     fstn = 0x0, 
> >     hw_next = 0x0, 
> >     ptr = 0x0
> >   }, 
> >   qtd_list = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   intr_node = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   dummy = 0x0, 
> >   unlink_node = {
> >     next = 0x0, 
> >     prev = 0x0
> >   }, 
> >   unlink_cycle = 0x6, 
> >   qh_state = 0x0, 
> >   xacterrs = 0x0, 
> >   usecs = 0x0, 
> >   gap_uf = 0x0, 
> >   c_usecs = 0x0, 
> >   tt_usecs = 0x0, 
> >   period = 0x0, 
> >   start = 0x0, 
> >   dev = 0x6, 
> >   is_out = 0x0, 
> >   clearing_tt = 0x0, 
> >   dequeue_during_giveback = 0x0, 
> >   exception = 0x0
> > }
> 
> So this QH was destroyed properly, but somehow the link pointer in
> ehci->async continued to point at it.  And it got destroyed without
> passing through single_unlink_async.
> 
> Maybe you should also add a check at the end of ehci_endpoint_disable, 
> just before the done: label.  For example,
> 
> 	BUG_ON(ehci->async->qh_next.qh == qh);
> 
> Then if you really want to be thorough, in single_unlink_async, just
> after the "while" loop, print the values of prev, prev->qh_next.qh, qh,
> and qh->qh_next.qh.  And in qh_link_async, just after the wmb(), print
> the values of head->qh_next.qh and qh.  (This could end up generating
> quite a lot of output...)
> 
> Alan Stern
> 

Quick turn around on this one :)

crash> log | grep 0000:2c:00.0

pci 0000:2c:00.0: [8086:1d26] type 00 class 0x0c0320
pci 0000:2c:00.0: reg 0x10: [mem 0x90000000-0x900003ff]
pci 0000:2c:00.0: PME# supported from D0 D3hot D3cold
ehci-pci 0000:2c:00.0: EHCI Host Controller
ehci-pci 0000:2c:00.0: new USB bus registered, assigned bus number 1
ehci-pci 0000:2c:00.0: debug port 2
ehci-pci 0000:2c:00.0: cache line size of 64 is not supported
ehci-pci 0000:2c:00.0: irq 10, io mem 0x90000000
ehci-pci 0000:2c:00.0: USB 2.0 started, EHCI 1.00
usb usb1: SerialNumber: 0000:2c:00.0
ehci-pci 0000:2c:00.0: qh_link_async:1003 ehci = ffff88084ff1b088, head->qh_next.qh =           (null), qh = ffff88083eacae50
ehci-pci 0000:2c:00.0: HC died; cleaning up
ehci-pci 0000:2c:00.0: ehci_endpoint_disable:944 ep->hcpriv = ffff88083eacae50, ehci(ffff88084ff1b088)->num_async = 1, ehci->async->qh_next.qh = ffff88083eacae50

------------[ cut here ]------------
kernel BUG at drivers/usb/host/ehci-hcd.c:1002!
invalid opcode: 0000 [#1] SMP 
...
CPU: 0 PID: 207 Comm: khubd Tainted: PF          O--------------   3.10.0-123.6.3.el7.bigphysarea_expedient1_usb4.x86_64 #1
Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013
task: ffff880853ff8cb0 ti: ffff880853434000 task.ti: ffff880853434000
RIP: 0010:[<ffffffff814157c6>]  [<ffffffff814157c6>] ehci_endpoint_disable+0x1f6/0x200
RSP: 0018:ffff880853435ca0  EFLAGS: 00010046
RAX: ffff8800372cc720 RBX: ffff88083eacae50 RCX: dead000000200200
RDX: ffff88085f800710 RSI: 0000000000000047 RDI: 0000000000000047
RBP: ffff880853435ce0 R08: ffff88085f800710 R09: 0000000100230022
R10: ffff88085f807600 R11: ffffffff81412158 R12: ffff88084ff1ae68
R13: ffff88084ff1b174 R14: ffff880851914a88 R15: 0000000000000246
FS:  0000000000000000(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffc501a2000 CR3: 0000000850402000 CR4: 00000000000407f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
 ffff880800000001 ffff88083eacae50 ffff88084ff1b088 ffff880851914a40
 ffff880851914a88 0000000000000001 ffff880851914a40 00000000ffffff00
 ffff880853435d00 ffffffff813fabf3 ffff880851914a40 ffff880851914a88
Call Trace:
 [<ffffffff813fabf3>] usb_hcd_disable_endpoint+0x33/0x40
 [<ffffffff813fd8b9>] usb_disable_endpoint+0x69/0x90
 [<ffffffff813effbc>] usb_ep0_reinit+0x1c/0x40
 [<ffffffff813f4fba>] hub_port_connect_change+0x33a/0xb50
 [<ffffffff813f5b49>] hub_events+0x379/0xa30
 [<ffffffff813f6245>] hub_thread+0x45/0x1a0
 [<ffffffff81086ac0>] ? wake_up_bit+0x30/0x30
 [<ffffffff813f6200>] ? hub_events+0xa30/0xa30
 [<ffffffff81085aff>] kthread+0xcf/0xe0
 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140
 [<ffffffff815f31ec>] ret_from_fork+0x7c/0xb0
 [<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140


Just to review, I pasted the USB related patches to this kernel below.  The
drivers/usb/core/hub.c changes include:

  d8521af "usb: assign default peer ports for root hubs" (maxchild parts)
  c605f3c "usb: hub: take hub->hdev reference when processing from eventlist"

the rest are debugging for the qh_destroy BUG.

-- Joe

diff -Nupr before/drivers/usb/core/hub.c after/drivers/usb/core/hub.c
--- before/drivers/usb/core/hub.c	2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/core/hub.c	2014-09-16 12:35:26.816249313 -0400
@@ -1304,6 +1304,7 @@ static int hub_configure(struct usb_hub
 	char *message = "out of memory";
 	unsigned unit_load;
 	unsigned full_load;
+	unsigned maxchild;
 
 	hub->buffer = kmalloc(sizeof(*hub->buffer), GFP_KERNEL);
 	if (!hub->buffer) {
@@ -1342,12 +1343,11 @@ static int hub_configure(struct usb_hub
 		goto fail;
 	}
 
-	hdev->maxchild = hub->descriptor->bNbrPorts;
-	dev_info (hub_dev, "%d port%s detected\n", hdev->maxchild,
-		(hdev->maxchild == 1) ? "" : "s");
+	maxchild = hub->descriptor->bNbrPorts;
+	dev_info (hub_dev, "%d port%s detected\n", maxchild,
+		(maxchild == 1) ? "" : "s");
 
-	hub->ports = kzalloc(hdev->maxchild * sizeof(struct usb_port *),
-			     GFP_KERNEL);
+	hub->ports = kzalloc(maxchild * sizeof(struct usb_port *), GFP_KERNEL);
 	if (!hub->ports) {
 		ret = -ENOMEM;
 		goto fail;
@@ -1368,11 +1368,11 @@ static int hub_configure(struct usb_hub
 		int	i;
 		char	portstr [USB_MAXCHILDREN + 1];
 
-		for (i = 0; i < hdev->maxchild; i++)
+		for (i = 0; i < maxchild; i++)
 			portstr[i] = hub->descriptor->u.hs.DeviceRemovable
 				    [((i + 1) / 8)] & (1 << ((i + 1) % 8))
 				? 'F' : 'R';
-		portstr[hdev->maxchild] = 0;
+		portstr[maxchild] = 0;
 		dev_dbg(hub_dev, "compound device; port removable status: %s\n", portstr);
 	} else
 		dev_dbg(hub_dev, "standalone hub\n");
@@ -1484,7 +1484,7 @@ static int hub_configure(struct usb_hub
 		if (hcd->power_budget > 0)
 			hdev->bus_mA = hcd->power_budget;
 		else
-			hdev->bus_mA = full_load * hdev->maxchild;
+			hdev->bus_mA = full_load * maxchild;
 		if (hdev->bus_mA >= full_load)
 			hub->mA_per_port = full_load;
 		else {
@@ -1499,7 +1499,7 @@ static int hub_configure(struct usb_hub
 			hub->descriptor->bHubContrCurrent);
 		hub->limited_power = 1;
 
-		if (remaining < hdev->maxchild * unit_load)
+		if (remaining < maxchild * unit_load)
 			dev_warn(hub_dev,
 					"insufficient power available "
 					"to use all downstream ports\n");
@@ -1567,15 +1567,17 @@ static int hub_configure(struct usb_hub
 	if (hub->has_indicators && blinkenlights)
 		hub->indicator [0] = INDICATOR_CYCLE;
 
-	for (i = 0; i < hdev->maxchild; i++) {
+	for (i = 0; i < maxchild; i++) {
 		ret = usb_hub_create_port_device(hub, i + 1);
 		if (ret < 0) {
 			dev_err(hub->intfdev,
 				"couldn't create port%d device.\n", i + 1);
-			hdev->maxchild = i;
-			goto fail_keep_maxchild;
+			break;
 		}
 	}
+	hdev->maxchild = i;
+	if (ret < 0)
+		goto fail;
 
 	usb_hub_adjust_deviceremovable(hdev, hub->descriptor);
 
@@ -1583,8 +1585,6 @@ static int hub_configure(struct usb_hub
 	return 0;
 
 fail:
-	hdev->maxchild = 0;
-fail_keep_maxchild:
 	dev_err (hub_dev, "config failed, %s (err %d)\n",
 			message, ret);
 	/* hub_disconnect() frees urb and descriptor */
@@ -4650,9 +4650,10 @@ static void hub_events(void)
 
 		hub = list_entry(tmp, struct usb_hub, event_list);
 		kref_get(&hub->kref);
+		hdev = hub->hdev;
+		usb_get_dev(hdev);
 		spin_unlock_irq(&hub_event_lock);
 
-		hdev = hub->hdev;
 		hub_dev = hub->intfdev;
 		intf = to_usb_interface(hub_dev);
 		dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n",
@@ -4864,6 +4865,7 @@ static void hub_events(void)
 		usb_autopm_put_interface(intf);
  loop_disconnected:
 		usb_unlock_device(hdev);
+		usb_put_dev(hdev);
 		kref_put(&hub->kref, hub_release);
 
         } /* end while (1) */
diff -Nupr before/drivers/usb/host/ehci.h after/drivers/usb/host/ehci.h
--- before/drivers/usb/host/ehci.h	2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci.h	2014-09-16 12:35:26.836249471 -0400
@@ -229,6 +229,8 @@ struct ehci_hcd {			/* one per controlle
 	struct dentry		*debug_dir;
 #endif
 
+	int num_async;
+
 	/* platform-specific data -- must come last */
 	unsigned long		priv[0] __aligned(sizeof(s64));
 };
diff -Nupr before/drivers/usb/host/ehci-hcd.c after/drivers/usb/host/ehci-hcd.c
--- before/drivers/usb/host/ehci-hcd.c	2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-hcd.c	2014-09-16 12:42:24.603336165 -0400
@@ -417,6 +417,7 @@ static void ehci_stop (struct usb_hcd *h
 
 	ehci_dbg (ehci, "stop\n");
 
+	ehci_info(ehci, "%s:%d ehci(%p)->num_async = %d, ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ehci, ehci->num_async, ehci->async->qh_next.qh);
 	/* no more interrupts ... */
 
 	spin_lock_irq(&ehci->lock);
@@ -940,6 +941,8 @@ ehci_endpoint_disable (struct usb_hcd *h
 	/* ASSERT:  any requests/urbs are being unlinked */
 	/* ASSERT:  nobody can be submitting urbs for this any more */
 
+	ehci_info(ehci, "%s:%d ep->hcpriv = %p, ehci(%p)->num_async = %d, ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ep->hcpriv, ehci, ehci->num_async, ehci->async->qh_next.qh);
+
 rescan:
 	spin_lock_irqsave (&ehci->lock, flags);
 	qh = ep->hcpriv;
@@ -995,6 +998,9 @@ idle_timeout:
 			list_empty (&qh->qtd_list) ? "" : "(has tds)");
 		break;
 	}
+
+	BUG_ON(ehci->async->qh_next.qh == qh);
+
  done:
 	ep->hcpriv = NULL;
 	spin_unlock_irqrestore (&ehci->lock, flags);
diff -Nupr before/drivers/usb/host/ehci-q.c after/drivers/usb/host/ehci-q.c
--- before/drivers/usb/host/ehci-q.c	2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-q.c	2014-09-16 12:42:21.553284025 -0400
@@ -959,6 +959,25 @@ static void disable_async(struct ehci_hc
 	ehci_poll_ASS(ehci);
 }
 
+static void check_async_ring(struct ehci_hcd *ehci, int add)
+{
+	struct ehci_qh *qh;
+	int n;
+
+	qh = ehci->async->qh_next.qh;
+	n = ehci->num_async += add;
+
+	while (qh && n > 0) {
+		qh = qh->qh_next.qh;
+		--n;
+	}
+	if (qh || n != 0) {
+		ehci_err(ehci, "EHCI async list corrupted: num %d n %d qh %p\n",
+				ehci->num_async, n, qh);
+		BUG();
+	}
+}
+
 /* move qh (and its qtds) onto async queue; maybe enable queue.  */
 
 static void qh_link_async (struct ehci_hcd *ehci, struct ehci_qh *qh)
@@ -981,6 +1000,8 @@ static void qh_link_async (struct ehci_h
 	qh->hw->hw_next = head->hw->hw_next;
 	wmb ();
 
+	ehci_info(ehci, "%s:%d ehci = %p, head->qh_next.qh = %p, qh = %p\n", __func__, __LINE__, ehci, head->qh_next.qh, qh);
+
 	head->qh_next.qh = qh;
 	head->hw->hw_next = dma;
 
@@ -990,6 +1011,8 @@ static void qh_link_async (struct ehci_h
 	/* qtd completions reported later by interrupt */
 
 	enable_async(ehci);
+
+	check_async_ring(ehci, 1);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -1254,10 +1277,14 @@ static void single_unlink_async(struct e
 	while (prev->qh_next.qh != qh)
 		prev = prev->qh_next.qh;
 
+	ehci_info(ehci, "%s:%d ehci = %p prev = %p, prev->qh_next.qh = %p, qh = %p, qh->qh_next.qh = %p\n", __func__, __LINE__, ehci, prev, prev->qh_next.qh, qh, qh->qh_next.qh);
+
 	prev->hw->hw_next = qh->hw->hw_next;
 	prev->qh_next = qh->qh_next;
 	if (ehci->qh_scan_next == qh)
 		ehci->qh_scan_next = qh->qh_next.qh;
+
+	check_async_ring(ehci, -1);
 }
 
 static void start_iaa_cycle(struct ehci_hcd *ehci)


--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Media]     [Linux Input]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Old Linux USB Devel Archive]

  Powered by Linux