On Tue, 5 Nov 2024 13:05:19 +0200, Mathias Nyman wrote:
> Out of curiosity, what was the longest needed 'stop endpoint' retry
> time you saw which still had a successful outcome?

On NEC, periodic endpoints appear to take up to one ESIT. It's one or
two retries on fast isochronous EPs, but interrupt EPs may take a while.
The longest I have seen was 24ms, on a SuperSpeed EP with bInterval 11,
so it looks like this may be the chip's limit (bInterval 11 = 128ms),
but admittedly, I haven't tested this thoroughly with many devices.
No problems ever seen with bulk or control on NEC.

On ASM3142 it seems that one retry tends to be enough, and IIRC all
endpoint types can misbehave.

I attached a patch, based on the earlier "redundant" patch, which tries
to detect these bugs by looping until it sees the EP Context state
change. Looping under a spinlock is dodgy, sure, but with these timeout
values it doesn't seem to completely break the driver, and it does find
these HC bugs.

And yes, I have seen the "absolutely fubar" case on ASM1042 with a bad
cable (repeated halts, resets and stops). Some time later it "died".

Regards,
Michal
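For reference, the interval arithmetic behind the figures above: SuperSpeed
periodic endpoints have a service interval of 2^(bInterval - 1) * 125us, so
bInterval 11 works out to 128ms. A minimal sketch of that calculation; the
helper name below is made up for illustration and is not part of the patch:

	/* SuperSpeed periodic service interval in microseconds */
	static inline unsigned int ss_ep_interval_us(unsigned int bInterval)
	{
		/* bInterval 11: 125 << 10 = 128000us = 128ms */
		return 125u << (bInterval - 1);
	}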
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index b6eb928e260f..85289f09ca18 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -487,6 +487,27 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags)
 	return 0;
 }
 
+static bool busywait(struct xhci_hcd *xhci, struct xhci_virt_ep *ep, int timeout_ms)
+{
+	struct xhci_ep_ctx *ep_ctx = xhci_get_ep_ctx(xhci, ep->vdev->out_ctx, ep->ep_index);
+	u64 t2, t1 = ktime_get_ns();
+
+	do {
+		t2 = ktime_get_ns();
+		if (GET_EP_CTX_STATE(READ_ONCE(ep_ctx)) == EP_STATE_RUNNING) {
+			xhci_info(xhci, "slot %d ep %d busy wait found ctx_state RUNNING after %lldus\n",
+				  ep->vdev->slot_id, ep->ep_index,
+				  (t2 - t1) / 1000);
+			return true;
+		}
+	} while (t2 < t1 + timeout_ms * 1000000);
+
+	xhci_info(xhci, "slot %d ep %d busy wait gave up after %lldus\n",
+		  ep->vdev->slot_id, ep->ep_index,
+		  (t2 - t1) / 1000);
+	return false;
+}
+
 void xhci_ring_ep_doorbell(struct xhci_hcd *xhci,
 		unsigned int slot_id,
 		unsigned int ep_index,
@@ -1114,6 +1135,20 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 		return;
 
 	ep_ctx = xhci_get_ep_ctx(xhci, ep->vdev->out_ctx, ep_index);
+	int ctx_state = GET_EP_CTX_STATE(READ_ONCE(ep_ctx));
+
+	if (ep->stop_retry)
+		xhci_info(xhci, comp_code == COMP_SUCCESS ?
+			  "it worked!\n" : "it failed %d\n", comp_code);
+	ep->stop_retry = false;
+
+	if (comp_code != COMP_SUCCESS && comp_code != COMP_CONTEXT_STATE_ERROR)
+		xhci_err(xhci, "slot %d ep %d WTF BUG in ep_state 0x%x\n",
+			 slot_id, ep_index, ep->ep_state);
+
+	if (comp_code == COMP_SUCCESS && busywait(xhci, ep, 3))
+		xhci_err(xhci, "slot %d ep %d ABSOLUTELY FUBAR BUG in ep_state 0x%x ctx_state %d\n",
+			 slot_id, ep_index, ep->ep_state, ctx_state);
 
 	trace_xhci_handle_cmd_stop_ep(ep_ctx);
 
@@ -1132,7 +1167,7 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 	 * next transfer, which then will return -EPIPE, and device side stall is
 	 * noted and cleared by class driver.
 	 */
-	switch (GET_EP_CTX_STATE(ep_ctx)) {
+	switch (ctx_state) {
 	case EP_STATE_HALTED:
 		xhci_dbg(xhci, "Stop ep completion raced with stall, reset ep\n");
 		if (ep->ep_state & EP_HAS_STREAMS) {
@@ -1149,19 +1184,55 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id,
 			break;
 		ep->ep_state &= ~EP_STOP_CMD_PENDING;
 		return;
+
 	case EP_STATE_STOPPED:
+		bool predicted = true;
 		/*
-		 * NEC uPD720200 sometimes sets this state and fails with
-		 * Context Error while continuing to process TRBs.
-		 * Be conservative and trust EP_CTX_STATE on other chips.
+		 * Per xHCI 4.6.9, Stop Endpoint command on a Stopped
+		 * EP is a Context State Error, and EP stays Stopped.
+		 * This outcome is valid if the command was redundant.
 		 */
-		if (!(xhci->quirks & XHCI_NEC_HOST))
-			break;
-		fallthrough;
+		if (ep->ep_state & EP_STOP_CMD_REDUNDANT)
+			predicted = false;
+		/*
+		 * Or it really failed on Halted, but somebody ran Reset
+		 * Endpoint later. EP state is now Stopped and EP_HALTED
+		 * still set because Reset EP handler will run after us.
+		 */
+		if (ep->ep_state & EP_HALTED)
+			predicted = false;
+		/*
+		 * On some HCs EP state remains Stopped for some tens of
+		 * us to a few ms or more after a doorbell ring, and any
+		 * new Stop Endpoint fails without aborting the restart.
+		 * This handler may run quickly enough to still see this
+		 * Stopped state, but it will soon change to Running.
+		 *
+		 * Assume this bug on unexpected Stop Endpoint failures.
+		 * Keep retrying until the EP starts and stops again, on
+		 * chips known to have the bug and to react positively.
+		 */
+		if (busywait(xhci, ep, predicted? 30: 3)) {
+			xhci_err(xhci, "slot %d ep %d %sSTOPPED BUG in ep_state 0x%x\n",
+				 slot_id, ep_index, predicted? "": "UNPREDICTED ",
+				 ep->ep_state);
+			goto retry;
+		}
+		if (predicted)
+			xhci_err(xhci, "slot %d ep %d MISPREDICTED STOPPED BUG in ep_state 0x%x (busywait too short?)\n",
+				 slot_id, ep_index, ep->ep_state);
+		else
+			xhci_info(xhci, "slot %d ep %d not a bug predicted in ep_state 0x%x\n",
+				 slot_id, ep_index, ep->ep_state);
+		break;
+
 	case EP_STATE_RUNNING:
 		/* Race, HW handled stop ep cmd before ep was running */
-		xhci_dbg(xhci, "Stop ep completion ctx error, ep is running\n");
-
+		xhci_err(xhci, "slot %d ep %d RUNNING BUG in ep_state %d\n",
+			 slot_id, ep_index, ep->ep_state);
+retry:
+		xhci_info(xhci, "stop again and see what happens...\n");
+		ep->stop_retry = true;
 		command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
 		if (!command) {
 			ep->ep_state &= ~EP_STOP_CMD_PENDING;
@@ -4375,11 +4446,31 @@ int xhci_queue_evaluate_context(struct xhci_hcd *xhci, struct xhci_command *cmd,
 int xhci_queue_stop_endpoint(struct xhci_hcd *xhci, struct xhci_command *cmd,
 			     int slot_id, unsigned int ep_index, int suspend)
 {
+	struct xhci_virt_ep *ep;
 	u32 trb_slot_id = SLOT_ID_FOR_TRB(slot_id);
 	u32 trb_ep_index = EP_INDEX_FOR_TRB(ep_index);
 	u32 type = TRB_TYPE(TRB_STOP_RING);
 	u32 trb_suspend = SUSPEND_PORT_FOR_TRB(suspend);
 
+	/*
+	 * Any of that means the EP is or will be Stopped by the time this
+	 * command runs. Queue it anyway for its side effects like calling
+	 * our default handler or complete(). But our handler must know if
+	 * the command is redundant, so check it now. The handler can't do
+	 * it later because those operations may finish before it runs.
+	 */
+	ep = xhci_get_virt_ep(xhci, slot_id, ep_index);
+	if (ep) {
+		if (ep->ep_state & (SET_DEQ_PENDING | EP_HALTED | EP_CLEARING_TT))
+			ep->ep_state |= EP_STOP_CMD_REDUNDANT;
+		else
+			ep->ep_state &= ~EP_STOP_CMD_REDUNDANT;
+		if (ep->ep_state & EP_STOP_CMD_REDUNDANT)
+			xhci_info(xhci, "slot %d ep %d queuing a redundant Stop Endpoint in ep_state 0x%x\n",
+				  slot_id, ep_index, ep->ep_state);
+	}
+	/* else: don't care, the handler will do nothing anyway */
+
 	return queue_command(xhci, cmd, 0, 0, 0,
 			trb_slot_id | trb_ep_index | type | trb_suspend, false);
 }
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index f0fb696d5619..05555ca4f38a 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -670,6 +670,8 @@ struct xhci_virt_ep {
 #define EP_SOFT_CLEAR_TOGGLE	(1 << 7)
 /* usb_hub_clear_tt_buffer is in progress */
 #define EP_CLEARING_TT		(1 << 8)
+/* queued Stop Endpoint is expected to fail */
+#define EP_STOP_CMD_REDUNDANT	(1 << 9)
 	/* ---- Related to URB cancellation ---- */
 	struct list_head	cancelled_td_list;
 	struct xhci_hcd		*xhci;
@@ -694,6 +696,8 @@ struct xhci_virt_ep {
 	int			next_frame_id;
 	/* Use new Isoch TRB layout needed for extended TBC support */
 	bool			use_extended_tbc;
+
+	bool			stop_retry;
 };
 
 enum xhci_overhead_type {
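As an aside, not part of the patch: the bounded poll in busywait() could also
be expressed with the generic helper in <linux/iopoll.h>. A minimal sketch,
assuming xhci-ring.c pulls in that header, keeping the "return true once the
EP Context reports Running" semantics and omitting the xhci_info() logging:

	#include <linux/iopoll.h>

	/*
	 * Illustrative alternative to busywait() above: poll the output
	 * context until the HC reports Running, or give up after
	 * timeout_ms. read_poll_timeout_atomic() busy-waits with udelay(),
	 * so it is usable under the spinlock, and returns 0 once the
	 * condition is met.
	 */
	static bool busywait_iopoll(struct xhci_hcd *xhci, struct xhci_virt_ep *ep,
				    int timeout_ms)
	{
		struct xhci_ep_ctx *ep_ctx = xhci_get_ep_ctx(xhci, ep->vdev->out_ctx,
							     ep->ep_index);
		u32 state;

		/* 1us between reads, timeout in us, no delay before the first read */
		return read_poll_timeout_atomic(GET_EP_CTX_STATE, state,
						state == EP_STATE_RUNNING,
						1, timeout_ms * 1000,
						false, ep_ctx) == 0;
	}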