Re: xhci_hcd 0000:00:14.0: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 2 comp_code 1

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 20.07.2015 23:13, Arkadiusz Miskiewicz wrote:
> On Saturday 18 of July 2015, Arkadiusz Miskiewicz wrote:
>> Hi.
>>
>> I'm on 4.2.0-rc2-00077-gf760b87 kernel and while trying to copy some file
>> from usb storage (sata disk behind sata-usb bridge or pendrive; hapens in
>> both cases) copying process hangs just early after start with:
> 
> Looks like suspend & resume is enough. Reloading bluetooth firmware done by kernel
> triggers problem:
> 
> [  106.302783] rtc_cmos 00:02: System wakeup disabled by ACPI
> [  106.313280] PM: resume of devices complete after 3003.032 msecs
> [  106.314079] Restarting tasks ... done.
> [  106.326434] Bluetooth: hci0: read Intel version: 370710018002030d00
> [  106.330422] Bluetooth: hci0: Intel Bluetooth firmware file: intel/ibt-hw-37.7.10-fw-1.80.2.3.d.bseq
> [  106.398223] xhci_hcd 0000:00:14.0: ERROR Transfer event TRB DMA ptr not part of current TD ep_index 0 comp_code 1

Looks like we get an event for a really old transfer for some reason, it should probably be handled already.

I got a patch that adds more paranoid checks for TRB cancel, which has been one major reasons
for the "Transfer event TRB DMA ptr not part of current TD" Errors. It also adds some logging
to show what's went wrong. (patch attached, applies on 4.2-rc3) Can you see if it helps?

If it doesn't, then adding xhci debugging could give us some clue:
echo -n 'module xhci_hcd =p' > /sys/kernel/debug/dynamic_debug/control

You said 3.19.3 works fine, but 4.0.8 and 4.2-rc2 fail, any chance you could 
bisect it?

Thanks

-Mathias   
  

>From cd27574451792569dff002ab33148cbaf9d52faf Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@xxxxxxxxxxxxxxx>
Date: Tue, 25 Nov 2014 17:35:27 +0200
Subject: [PATCH] xhci: Don't touch URB TD memory if they are no longer on the
 endpoint ring

If a URB is cancelled we want to make sure the URB's TRBs still point
to memory on the endpoint ring. If the ring was already dropped then that
TRB may point to memory already in use by another ring, or freed.

Signed-off-by: Mathias Nyman <mathias.nyman@xxxxxxxxxxxxxxx>
---
 drivers/usb/host/xhci-ring.c | 33 ++++++++++++++++++++++++++++++---
 drivers/usb/host/xhci.c      | 29 ++++++++++++++++++++++++++++-
 drivers/usb/host/xhci.h      |  1 +
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 94416ff..1e46d4f 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -136,6 +136,25 @@ static void next_trb(struct xhci_hcd *xhci,
 	}
 }
 
+/* check if the TD is on the ring */
+bool xhci_td_on_ring(struct xhci_td *td, struct xhci_ring *ring)
+{
+	struct xhci_segment *seg;
+
+	if (!td->start_seg || !ring || !ring->first_seg)
+		return false;
+
+	seg = ring->first_seg;
+	do {
+		if (td->start_seg == seg)
+			return true;
+		seg = seg->next;
+	} while (seg != ring->first_seg);
+
+	return false;
+}
+
+
 /*
  * See Cycle bit rules. SW is the consumer for the event ring only.
  * Don't make a ring full of link TRBs.  That would be dumb and this would loop.
@@ -685,10 +704,16 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 					cur_td->urb->stream_id);
 			goto remove_finished_td;
 		}
-		/*
-		 * If we stopped on the TD we need to cancel, then we have to
-		 * move the xHC endpoint ring dequeue pointer past this TD.
+		/* In case ring was dropped and segments freed or cached we
+		 * don't want to touch that memory anymore, or, if we stopped
+		 * on the TD we want to remove we need to move the dq pointer
+		 * past this TD, otherwise just turn TD to no-op
 		 */
+		if (!xhci_td_on_ring(cur_td, ep_ring)) {
+			xhci_err(xhci, "Cancelled TD not on stopped ring\n");
+			goto remove_finished_td;
+		}
+
 		if (cur_td == ep->stopped_td)
 			xhci_find_new_dequeue_state(xhci, slot_id, ep_index,
 					cur_td->urb->stream_id,
@@ -1295,11 +1320,13 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
 	/* Is the command ring deq ptr out of sync with the deq seg ptr? */
 	if (cmd_dequeue_dma == 0) {
 		xhci->error_bitmask |= 1 << 4;
+		xhci_err(xhci, "Command completion ptr and seg out of sync\n");
 		return;
 	}
 	/* Does the DMA address match our internal dequeue pointer address? */
 	if (cmd_dma != (u64) cmd_dequeue_dma) {
 		xhci->error_bitmask |= 1 << 5;
+		xhci_err(xhci, "Command completion DMA address mismatch\n");
 		return;
 	}
 
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 7da0d60..d72b46e 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1527,6 +1527,7 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 	struct xhci_ring *ep_ring;
 	struct xhci_virt_ep *ep;
 	struct xhci_command *command;
+	bool remove_td_from_ring = false;
 
 	xhci = hcd_to_xhci(hcd);
 	spin_lock_irqsave(&xhci->lock, flags);
@@ -1587,9 +1588,28 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
 					urb_priv->td[i]->start_seg,
 					urb_priv->td[i]->first_trb));
 
+	/* make sure the TDs of the URB are still on the endpoint ring */
 	for (; i < urb_priv->length; i++) {
 		td = urb_priv->td[i];
-		list_add_tail(&td->cancelled_td_list, &ep->cancelled_td_list);
+		if (xhci_td_on_ring(td, ep_ring)) {
+			list_add_tail(&td->cancelled_td_list,
+				      &ep->cancelled_td_list);
+			remove_td_from_ring = true;
+		} else {
+			xhci_err(xhci, "Cancel URB NOT on current ring\n");
+			if (!list_empty(&td->td_list))
+				list_del_init(&td->td_list);
+		}
+	}
+
+
+	/* No need to stop the ring to remove the TD if it isn't on the ring */
+	if (!remove_td_from_ring) {
+		xhci_urb_free_priv(urb_priv);
+		usb_hcd_unlink_urb_from_ep(hcd, urb);
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		usb_hcd_giveback_urb(hcd, urb, 0);
+		return ret;
 	}
 
 	/* Queue a stop endpoint command, but only if this is
@@ -3555,6 +3575,13 @@ int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev)
 			ep->ep_state &= ~EP_HAS_STREAMS;
 		}
 
+		if (!list_empty(&ep->cancelled_td_list))
+			xhci_err(xhci, "Error unhandled cancelled TD's after dev reset\n");
+
+		if (ep->ring && !list_empty(&ep->ring->td_list))
+			xhci_err(xhci, "Error unhandled TD's after dev reset\n");
+
+
 		if (ep->ring) {
 			xhci_free_or_cache_endpoint_ring(xhci, virt_dev, i);
 			last_freed_endpoint = i;
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 31e46cc..efb504c 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1808,6 +1808,7 @@ void xhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev);
 
 /* xHCI ring, segment, TRB, and TD functions */
 dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg, union xhci_trb *trb);
+bool xhci_td_on_ring(struct xhci_td *td, struct xhci_ring *ring);
 struct xhci_segment *trb_in_td(struct xhci_hcd *xhci,
 		struct xhci_segment *start_seg, union xhci_trb *start_trb,
 		union xhci_trb *end_trb, dma_addr_t suspect_dma, bool debug);
-- 
1.8.3.2


[Index of Archives]     [Linux Media]     [Linux Input]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Old Linux USB Devel Archive]

  Powered by Linux