Re: Strange issues with UAS URB cancellation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

[ I changed the title a little. And please use "reply to all" when
responding to the list, so that everybody involved in the thread gets
the response directly, regardless of linux-usb subscription status. ]

This 6.11-rc5 result is still not looking great, and neither is the
non-UAS 5.10.223 for that matter. These xHCI errors shouldn't be there;
it looks like there are still some bugs in URB cancellation code or in
the hardware maybe.

I'm not very familiar with streams stuff, but I remember having similar
symptoms on isochronous devices due to a particular hardware bug (not
on ASMedia though) and due to some event processing bugs.

I can offer this patch for testing (on 6.11-rc5), which tries to:
1. work around (hypothetical) similar HW bugs on non-NEC controllers
2. generally add more error checking in this area
3. log all transfer completion events and print the log on errors

This may shed more light on what's going on and why things are failing.
Knowing xHCI driver, in each of those cases it probably gets completely
stuck, until SCSI layer loses patience 30 seconds later and requests a
reset of the storage device.

Regards,
Michal
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 4ea2c3e072a9..0acdcf4d28db 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -814,11 +814,16 @@ static void xhci_unmap_td_bounce_buffer(struct xhci_hcd *xhci,
 	seg->bounce_offs = 0;
 }
 
-static int xhci_td_cleanup(struct xhci_hcd *xhci, struct xhci_td *td,
-			   struct xhci_ring *ep_ring, int status)
+static int __xhci_td_cleanup(struct xhci_hcd *xhci, struct xhci_td *td,
+			   struct xhci_ring *ep_ring, int status, const char *from)
 {
 	struct urb *urb = NULL;
 
+	ep_ring->cleanup_first_dma = xhci_trb_virt_to_dma(td->start_seg, td->first_trb);
+	ep_ring->cleanup_last_dma = xhci_trb_virt_to_dma(td->last_trb_seg, td->last_trb);
+	ep_ring->cleanup_status = status;
+	ep_ring->cleanup_from = from;
+
 	/* Clean up the endpoint's TD list */
 	urb = td->urb;
 
@@ -862,6 +867,8 @@ static int xhci_td_cleanup(struct xhci_hcd *xhci, struct xhci_td *td,
 	return 0;
 }
 
+#define xhci_td_cleanup(xhci, td, ep_ring, status) __xhci_td_cleanup(xhci, td, ep_ring, status, __func__)
+
 
 /* Complete the cancelled URBs we unlinked from td_list. */
 static void xhci_giveback_invalidated_tds(struct xhci_virt_ep *ep)
@@ -1153,14 +1160,13 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 			/*
 			 * NEC uPD720200 sometimes sets this state and fails with
 			 * Context Error while continuing to process TRBs.
-			 * Be conservative and trust EP_CTX_STATE on other chips.
 			 */
-			if (!(xhci->quirks & XHCI_NEC_HOST))
-				break;
+			xhci_info(xhci, "Stop Endpoint Context Error & Stopped on slot %d ep %d\n",
+					slot_id, ep_index);
 			fallthrough;
 		case EP_STATE_RUNNING:
 			/* Race, HW handled stop ep cmd before ep was running */
-			xhci_dbg(xhci, "Stop ep completion ctx error, ep is running\n");
+			xhci_info(xhci, "Stop ep completion ctx error, ep is running\n");
 
 			command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
 			if (!command) {
@@ -1176,6 +1182,11 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 		}
 	}
 
+	int state = GET_EP_CTX_STATE(ep_ctx);
+	if (comp_code != COMP_SUCCESS || state != EP_STATE_STOPPED)
+		xhci_err(xhci, "Stop Endpoint on slot %d ep %d failed with comp %d ctx_state %d\n",
+				slot_id, ep_index, comp_code, state);
+
 	/* will queue a set TR deq if stopped on a cancelled, uncleared TD */
 	xhci_invalidate_cancelled_tds(ep);
 	ep->ep_state &= ~EP_STOP_CMD_PENDING;
@@ -2636,6 +2647,11 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 	if (!ep_ring)
 		return handle_transferless_tx_event(xhci, ep, trb_comp_code);
 
+	snprintf(&ep_ring->log_b[ep_ring->log_i++][0], 100,
+			"event %px ep_trb_dma %llx comp_code %d len %d slot %d ep %d",
+			event, ep_trb_dma, trb_comp_code, EVENT_TRB_LEN(le32_to_cpu(event->transfer_len)), slot_id, ep_index);
+	ep_ring->log_i %= 5;
+
 	/* Look for common error cases */
 	switch (trb_comp_code) {
 	/* Skip codes that require special handling depending on
@@ -2863,6 +2879,11 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 					"comp_code %u\n", ep_index,
 					trb_comp_code);
 				trb_in_td(xhci, td, ep_trb_dma, true);
+				xhci_info(xhci, "last xhci_td_cleanup: first_dma %llx last_dma %llx status %d from %s\n",
+						ep_ring->cleanup_first_dma, ep_ring->cleanup_last_dma,
+						ep_ring->cleanup_status, ep_ring->cleanup_from);
+				for (int i = 0; i < 5; i++)
+					xhci_info(xhci, "handle_tx_event log %2d: %s\n", i-4, &ep_ring->log_b[(ep_ring->log_i + i) % 5][0]);
 
 				return -ESHUTDOWN;
 			}
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index ebd0afd59a60..cec743d86a3b 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1361,6 +1361,14 @@ struct xhci_ring {
 	enum xhci_ring_type	type;
 	bool			last_td_was_short;
 	struct radix_tree_root	*trb_address_map;
+
+	dma_addr_t		cleanup_first_dma;
+	dma_addr_t		cleanup_last_dma;
+	const char		*cleanup_from;
+	int			cleanup_status;
+
+	int			log_i;
+	char			log_b[5][100];
 };
 
 struct xhci_erst_entry {

[Index of Archives]     [Linux Media]     [Linux Input]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Old Linux USB Devel Archive]

  Powered by Linux