Re: [RFT v2] xHCI: cancel command queued in the command ring

Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx> · Fri, 01 Jun 2012 14:39:05 +0300

Elric Fu <elricfu1@xxxxxxxxx> writes:

> Software have to abort command ring and cancel command
> when a command is failed or hang. Otherwise, the command
> ring will hang up and can't handle the others. An example
> of a command that may hang is the Address Device Command,
> because waiting for a SET_ADDRESS request to be acknowledged
> by a USB device is outside of the xHC's ability to control.
>
> According to xHCI spec section 4.6.1.1 and section 4.6.1.2,
> after aborting a command on the command ring, xHC will
> generate a command completion event with its completion code
> set to Command Ring Stopped at least. If a command is
> currently executing at the time of aborting a command, xHC
> also generate a command completion event with its completion
> code set to Command Abort. When the command ring is stopped,
> software may remove, add, or rearrage Command Descriptors.
>
> The RFT patch is used to cancel command when the above
> situation occur.
>
> To cancel a command, software will initialize a command
> descriptor for the cancel command, and add it into a
> cancel_cmd_list of xhci. When the command ring is stopped,
> software will find the command trbs described by command
> descriptors in cancel_cmd_list and modify it to No Op
> command. If software can't find the matched trbs, we can
> think it had been finished.
>
> Signed-off-by: Elric Fu <elricfu1@xxxxxxxxx>
> Acked-by: Andiry Xu <Andiry.Xu@xxxxxxx>
> ---
>
> Hi Sarah, 
>    
> please test it in your old VIA USB3.0 hub. I can't duplicate
> the case of the address device command timeout in my VIA
> USB3.0 hub. I heard that the GENESYS GL3310 sata bridge would
> respond the address device command after a long time when it
> was plugged into VIA xHCI host. And the issue will occur after
> several quick disconnect/connect cycles. Yesterday I finally
> got a GL3310 device. But it is ok and responds the address 
> device command very quickly. I can't duplicate it.
>
> In additional, any comment about the patch?

Hi,

I've tested this patch (it requires some rediffing) with a UDC that I
hacked to ignore SET_ADDRESS requests and it does fix the problem. Some
comments below.

[snip]
> +int xhci_abort_cmd_ring(struct xhci_hcd *xhci)

Shouldn't this be static?

> +{
> +	u64 temp_64;
> +	int ret;
> +
> +	xhci_dbg(xhci, "Abort command ring\n");
> +
> +	if (!(xhci->cmd_ring_state & CMD_RING_STATE_RUNNING)) {
> +		xhci_dbg(xhci, "The command ring isn't running, "
> +				"Have the command ring been stopped?");
> +		return -EINVAL;
> +	}
> +
> +	temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring);
> +	if (!(temp_64 & CMD_RING_RUNNING)) {
> +		xhci_dbg(xhci, "Command ring had been stopped\n");
> +		return -EINVAL;
> +	}
> +	xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
> +	xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
> +			&xhci->op_regs->cmd_ring);
> +
> +	/* Section 4.6.1.2 of xHCI 1.0 spec says software should
> +	 * time the completion od all xHCI commands, including
> +	 * the Command Abort operation. If software doesn't see
> +	 * CRR negated in a timely manner (e.g. longer than 5
> +	 * seconds), then it should assume that the there are
> +	 * larger problems with the xHC and assert HCRST.
> +	 */
> +	ret = handshake(xhci, &xhci->op_regs->cmd_ring,
> +			CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
> +	if (ret < 0) {
> +		xhci_err(xhci, "Stopped the command ring failed, "
> +				"maybe the host is dead\n");
> +		xhci->xhc_state |= XHCI_STATE_DYING;
> +		xhci_quiesce(xhci);
> +		xhci_halt(xhci);
> +		return -ESHUTDOWN;
> +	}
> +
> +	return 0;
> +}
> +
> +static int queue_cd(struct xhci_hcd *xhci, int slot_id,
> +		u32 cmd_type, int ep_index)
> +{
> +	struct xhci_cd *cd;
> +
> +	cd = kzalloc(sizeof(struct xhci_cd), GFP_ATOMIC);
> +	if (!cd)
> +		return -ENOMEM;
> +	INIT_LIST_HEAD(&cd->cancel_cmd_list);
> +
> +	cd->slot_id = slot_id;
> +	cd->cmd_type = cmd_type;
> +	cd->ep_index = ep_index;
> +	list_add_tail(&cd->cancel_cmd_list, &xhci->cancel_cmd_list);
> +
> +	return 0;
> +}
> +
> +int xhci_cancel_cmd(struct xhci_hcd *xhci, int slot_id,
> +		u32 cmd_type, int ep_index)
> +{
> +	int retval;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&xhci->lock, flags);

It looks like xhci_abort_cmd_ring() can race with xhci_ring_cmd_db(),
since there doesn't seem to be a requirement for the latter to be called
under &xhci->lock. I found one case of this in xhci_run_finished() for
controllers with NEC quirk enabled.

> +
> +	if (xhci->xhc_state & XHCI_STATE_DYING) {
> +		xhci_warn(xhci, "Abort the command ring,"
> +				" but the xHCI is dead.\n");
> +		spin_unlock_irqrestore(&xhci->lock, flags);
> +		return -ESHUTDOWN;
> +	}
> +
> +	/* queue the cmd desriptor to cancelled_cd_list */
> +	retval = queue_cd(xhci, slot_id, cmd_type, ep_index);
> +	if (retval) {
> +		spin_unlock_irqrestore(&xhci->lock, flags);
> +		return retval;
> +	}
> +
> +	/* abort command ring */
> +	retval = xhci_abort_cmd_ring(xhci);
> +	if (retval) {
> +		xhci_err(xhci, "Abort command ring failed\n");
> +		spin_unlock_irqrestore(&xhci->lock, flags);
> +		if (retval == -ESHUTDOWN) {
> +			usb_hc_died(xhci_to_hcd(xhci)->primary_hcd);
> +			xhci_dbg(xhci, "xHCI host controller is dead.\n");
> +		}
> +		return retval;
> +	}
> +
> +	spin_unlock_irqrestore(&xhci->lock, flags);
> +	return 0;
> +}
> +
>  void xhci_ring_ep_doorbell(struct xhci_hcd *xhci,
>  		unsigned int slot_id,
>  		unsigned int ep_index,
> @@ -1050,6 +1152,127 @@ static int handle_cmd_in_cmd_wait_list(struct xhci_hcd *xhci,
>  	return 1;
>  }
>  
> +/*
> + * Find the command trb need to be cancelled and modify it to NO OP
> + * command.
> + *
> + * If we can't find the command trb, we think it had already been
> + * executed.
> + */
> +static void cd_to_noop(struct xhci_hcd *xhci, struct xhci_cd *cur_cd)
> +{
> +	struct xhci_segment *cur_seg;
> +	union xhci_trb *cmd_trb;
> +	int slot_id;
> +	u32 cmd_type;
> +	u32 cycle_state;
> +	int ep_index;
> +
> +	if (xhci->cmd_ring->dequeue == xhci->cmd_ring->enqueue)
> +		return;
> +
> +	/* find the current segmene of command ring */
> +	cur_seg = find_trb_seg(xhci->cmd_ring->first_seg,
> +			xhci->cmd_ring->dequeue, &cycle_state);
> +
> +	/* find the command trb matched by CD from command ring */
> +	for (cmd_trb = xhci->cmd_ring->dequeue;
> +			cmd_trb != xhci->cmd_ring->enqueue;
> +			next_trb(xhci, xhci->cmd_ring, &cur_seg, &cmd_trb)) {
> +		if (TRB_TYPE_LINK_LE32(cmd_trb->generic.field[3]))
> +			continue;
> +
> +		slot_id = TRB_TO_SLOT_ID(
> +				le32_to_cpu(cmd_trb->generic.field[3]));
> +		cmd_type = TRB_FIELD_TO_TYPE(
> +				le32_to_cpu(cmd_trb->generic.field[3]));
> +		if ((slot_id == cur_cd->slot_id) &&
> +				(cmd_type == cur_cd->cmd_type)) {
> +
> +			/* The ep_index in the command desriptors of Stop
> +			 * Endpoint command, Set TR Dequeue Pointer command
> +			 * and Reset Endpoint command is vaild. so it should
> +			 * be match the ep_index of cd.
> +			 */
> +			if ((cmd_type == TRB_STOP_RING) ||
> +					(cmd_type == TRB_SET_DEQ) ||
> +					(cmd_type == TRB_RESET_EP)) {
> +				ep_index = TRB_TO_EP_INDEX(le32_to_cpu(
> +						cmd_trb->generic.field[3]));
> +				if (ep_index != cur_cd->ep_index)
> +					continue;
> +			}
> +
> +			/* get cycle state from the origin command trb */
> +			cycle_state = le32_to_cpu(cmd_trb->generic.field[3])
> +				& TRB_CYCLE;
> +
> +			/* modify the command trb to NO OP command */
> +			cmd_trb->generic.field[0] = 0;
> +			cmd_trb->generic.field[1] = 0;
> +			cmd_trb->generic.field[2] = 0;
> +			cmd_trb->generic.field[3] = cpu_to_le32(
> +					TRB_TYPE(TRB_CMD_NOOP) | cycle_state);
> +			break;
> +		}
> +	}
> +}
> +
> +static void cancel_cd(struct xhci_hcd *xhci)
> +{
> +	struct xhci_cd *cur_cd, *next_cd;
> +
> +	if (list_empty(&xhci->cancel_cmd_list))
> +		return;
> +
> +	list_for_each_entry_safe(cur_cd, next_cd,
> +			&xhci->cancel_cmd_list, cancel_cmd_list) {
> +		xhci_dbg(xhci, "Cancel command, slot %d, cmd type %d, "
> +				"ep_index = %d\n", cur_cd->slot_id,
> +				cur_cd->cmd_type, cur_cd->ep_index);
> +		cd_to_noop(xhci, cur_cd);
> +
> +		/* Whatever we find the matched command trb, we need to
> +		 * release the command descriptor.
> +		 */
> +		list_del(&cur_cd->cancel_cmd_list);
> +		kfree(cur_cd);
> +	}
> +}
> +
> +static void handle_stopped_cmd_ring(struct xhci_hcd *xhci,
> +		struct xhci_event_cmd *event)
> +{
> +	struct xhci_virt_device *virt_dev;
> +	int slot_id = TRB_TO_SLOT_ID(
> +			le32_to_cpu(xhci->cmd_ring->dequeue->generic.field[3]));
> +	int cmd_trb_comp_code = GET_COMP_CODE(le32_to_cpu(event->status));
> +
> +	virt_dev = xhci->devs[slot_id];
> +	if (virt_dev) {
> +		/* if the command is in cmd_list, handle it. */
> +		handle_cmd_in_cmd_wait_list(xhci, virt_dev, event);
> +	}
> +
> +	/* advance the dequeue pointer to next cmd trb */
> +	inc_deq(xhci, xhci->cmd_ring, false);
> +
> +	/* set the cmd ring state */
> +	if (cmd_trb_comp_code != COMP_CMD_STOP) {
> +		xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
> +	} else {
> +		cancel_cd(xhci);
> +		xhci->cmd_ring_state = CMD_RING_STATE_RUNNING;
> +
> +		/*
> +		 * ring command ring doorbell again to handle the waiting
> +		 * command trbs due to aborting command ring
> +		 */
> +		if (xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue)
> +			xhci_ring_cmd_db(xhci);
> +	}
> +}
> +
>  static void handle_cmd_completion(struct xhci_hcd *xhci,
>  		struct xhci_event_cmd *event)
>  {
> @@ -1075,6 +1298,13 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
>  		xhci->error_bitmask |= 1 << 5;
>  		return;
>  	}
> +
> +	if ((GET_COMP_CODE(le32_to_cpu(event->status)) == COMP_CMD_ABORT) ||
> +		(GET_COMP_CODE(le32_to_cpu(event->status)) == COMP_CMD_STOP)) {
> +		handle_stopped_cmd_ring(xhci, event);
> +		return;
> +	}
> +
>  	switch (le32_to_cpu(xhci->cmd_ring->dequeue->generic.field[3])
>  		& TRB_TYPE_BITMASK) {
>  	case TRB_TYPE(TRB_ENABLE_SLOT):
> diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
> index c939f5f..9434771 100644
> --- a/drivers/usb/host/xhci.c
> +++ b/drivers/usb/host/xhci.c
> @@ -51,7 +51,7 @@ MODULE_PARM_DESC(link_quirk, "Don't clear the chain bit on a link TRB");
>   * handshake done).  There are two failure modes:  "usec" have passed (major
>   * hardware flakeout), or the register reads as all-ones (hardware removed).
>   */
> -static int handshake(struct xhci_hcd *xhci, void __iomem *ptr,
> +int handshake(struct xhci_hcd *xhci, void __iomem *ptr,
>  		      u32 mask, u32 done, int usec)
>  {
>  	u32	result;
> @@ -104,8 +104,10 @@ int xhci_halt(struct xhci_hcd *xhci)
>  
>  	ret = handshake(xhci, &xhci->op_regs->status,
>  			STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC);
> -	if (!ret)
> +	if (!ret) {
>  		xhci->xhc_state |= XHCI_STATE_HALTED;
> +		xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
> +	}
>  	return ret;
>  }
>  
> @@ -473,6 +475,7 @@ static int xhci_run_finished(struct xhci_hcd *xhci)
>  		return -ENODEV;
>  	}
>  	xhci->shared_hcd->state = HC_STATE_RUNNING;
> +	xhci->cmd_ring_state = CMD_RING_STATE_RUNNING;
>  
>  	if (xhci->quirks & XHCI_NEC_HOST)
>  		xhci_ring_cmd_db(xhci);
> @@ -3542,7 +3545,10 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev)
>  	if (timeleft <= 0) {
>  		xhci_warn(xhci, "%s while waiting for address device command\n",
>  				timeleft == 0 ? "Timeout" : "Signal");
> -		/* FIXME cancel the address device command */
> +		/* cancel the address device command */
> +		ret = xhci_cancel_cmd(xhci, udev->slot_id, TRB_ADDR_DEV, 0);
> +		if (ret < 0)
> +			return ret;
>  		return -ETIME;
>  	}
>  
> diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
> index fb99c83..03ff6ab 100644
> --- a/drivers/usb/host/xhci.h
> +++ b/drivers/usb/host/xhci.h
> @@ -1247,6 +1247,14 @@ struct xhci_td {
>  	union xhci_trb		*last_trb;
>  };
>  
> +/* command descriptor */
> +struct xhci_cd {
> +	struct list_head        cancel_cmd_list;
> +	int                     slot_id;
> +	u32                     cmd_type;
> +	int			ep_index;
> +};
> +
>  struct xhci_dequeue_state {
>  	struct xhci_segment *new_deq_seg;
>  	union xhci_trb *new_deq_ptr;
> @@ -1394,6 +1402,11 @@ struct xhci_hcd {
>  	/* data structures */
>  	struct xhci_device_context_array *dcbaa;
>  	struct xhci_ring	*cmd_ring;
> +	unsigned int		cmd_ring_state;
> +#define CMD_RING_STATE_RUNNING		(1 << 0)
> +#define CMD_RING_STATE_ABORTED		(1 << 1)
> +#define CMD_RING_STATE_STOPPED		(1 << 2)

I don't think you want a bit mask here, since you can't reside in more
than one of these states at any given point in time. Can it be a enum
instead?

Regards,
--
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html