On 16.11.2016 11:50, OGAWA Hirofumi wrote:
Current abort operation has race. xhci_handle_command_timeout() xhci_abort_cmd_ring() xhci_write_64(CMD_RING_ABORT) xhci_handshake(5s) do { check CMD_RING_RUNNING udelay(1) ... COMP_CMD_ABORT event COMP_CMD_STOP event xhci_handle_stopped_cmd_ring() restart cmd_ring CMD_RING_RUNNING become 1 again } while () return -ETIMEDOUT xhci_write_64(CMD_RING_ABORT) /* can abort random command */ To do abort operation correctly, we have to wait both of COMP_CMD_STOP event and negation of CMD_RING_RUNNING. But like above, while timeout handler is waiting negation of CMD_RING_RUNNING, event handler can restart cmd_ring. So timeout handler never be notice negation of CMD_RING_RUNNING, and retry of CMD_RING_ABORT can abort random command (BTW, I guess retry of CMD_RING_ABORT was workaround of this race).
Nice catch Didn't see that race, with the possible abortion of a random command. The retry was a workaround for a case triggered on v4.1.4 by Vincent Pelletier (added to CC) http://marc.info/?l=linux-usb&m=144031185222108&w=2 The race could very well explain that issue.
To fix this race, this moves xhci_handle_stopped_cmd_ring() to xhci_abort_cmd_ring(). And timeout handler waits COMP_CMD_STOP event.
Sounds reasonable
At this point, timeout handler is owner of cmd_ring, and safely restart cmd_ring by using xhci_handle_stopped_cmd_ring(). [FWIW, as bonus, this way would be easily extend to add CMD_RING_PAUSE operation] Signed-off-by: OGAWA Hirofumi <hirofumi@xxxxxxxxxxxxxxxxxx> --- drivers/usb/host/xhci-mem.c | 1 drivers/usb/host/xhci-ring.c | 162 ++++++++++++++++++++++-------------------- drivers/usb/host/xhci.h | 1 3 files changed, 87 insertions(+), 77 deletions(-) diff -puN drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2 drivers/usb/host/xhci-mem.c --- xhci/drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2 2016-11-16 18:38:52.551146764 +0900 +++ xhci-hirofumi/drivers/usb/host/xhci-mem.c 2016-11-16 18:38:52.553146763 +0900 @@ -2344,6 +2344,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, /* init command timeout work */ INIT_DELAYED_WORK(&xhci->cmd_timer, xhci_handle_command_timeout); + init_completion(&xhci->stop_completion); page_size = readl(&xhci->op_regs->page_size); xhci_dbg_trace(xhci, trace_xhci_dbg_init, diff -puN drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2 drivers/usb/host/xhci-ring.c --- xhci/drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2 2016-11-16 18:38:52.552146764 +0900 +++ xhci-hirofumi/drivers/usb/host/xhci-ring.c 2016-11-16 18:39:07.027136278 +0900 @@ -284,6 +284,60 @@ static bool xhci_mod_cmd_timer(struct xh return mod_delayed_work(system_wq, &xhci->cmd_timer, delay); } +static struct xhci_command *xhci_next_queued_cmd(struct xhci_hcd *xhci) +{ + return list_first_entry_or_null(&xhci->cmd_list, struct xhci_command, + cmd_list); +} + +/* + * Turn all commands on command ring with status set to "aborted" to no-op trbs. + * If there are other commands waiting then restart the ring and kick the timer. + * This must be called with command ring stopped and xhci->lock held. + */ +static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, + struct xhci_command *cur_cmd) +{ + struct xhci_command *i_cmd; + u32 cycle_state; + + /* Turn all aborted commands in list to no-ops, then restart */ + list_for_each_entry(i_cmd, &xhci->cmd_list, cmd_list) { + + if (i_cmd->status != COMP_CMD_ABORT) + continue; + + i_cmd->status = COMP_CMD_STOP; + + xhci_dbg(xhci, "Turn aborted command %p to no-op\n", + i_cmd->command_trb); + /* get cycle state from the original cmd trb */ + cycle_state = le32_to_cpu( + i_cmd->command_trb->generic.field[3]) & TRB_CYCLE; + /* modify the command trb to no-op command */ + i_cmd->command_trb->generic.field[0] = 0; + i_cmd->command_trb->generic.field[1] = 0; + i_cmd->command_trb->generic.field[2] = 0; + i_cmd->command_trb->generic.field[3] = cpu_to_le32( + TRB_TYPE(TRB_CMD_NOOP) | cycle_state); + + /* + * caller waiting for completion is called when command + * completion event is received for these no-op commands + */ + } + + xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; + + /* ring command ring doorbell to restart the command ring */ + if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) && + !(xhci->xhc_state & XHCI_STATE_DYING)) { + xhci->current_cmd = cur_cmd; + xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT); + xhci_ring_cmd_db(xhci); + } +} + static int xhci_abort_cmd_ring(struct xhci_hcd *xhci) { u64 temp_64; @@ -291,16 +345,9 @@ static int xhci_abort_cmd_ring(struct xh xhci_dbg(xhci, "Abort command ring\n"); - temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); - xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; + reinit_completion(&xhci->stop_completion); - /* - * Writing the CMD_RING_ABORT bit should cause a cmd completion event, - * however on some host hw the CMD_RING_RUNNING bit is correctly cleared - * but the completion event in never sent. Use the cmd timeout timer to - * handle those cases. Use twice the time to cover the bit polling retry - */ - xhci_mod_cmd_timer(xhci, 2 * XHCI_CMD_DEFAULT_TIMEOUT); + temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, &xhci->op_regs->cmd_ring); @@ -320,15 +367,29 @@ static int xhci_abort_cmd_ring(struct xh udelay(1000); ret = xhci_handshake(&xhci->op_regs->cmd_ring, CMD_RING_RUNNING, 0, 3 * 1000 * 1000); - if (ret == 0) - return 0; + if (ret < 0) { + xhci_err(xhci, "Stopped the command ring failed, " + "maybe the host is dead\n"); + xhci->xhc_state |= XHCI_STATE_DYING; + xhci_halt(xhci); + return -ESHUTDOWN; + } + }
So if we can verify that the race fix solves the old issue for Vincent Pelletier then we could get rid of the abort retry above as well.
- xhci_err(xhci, "Stopped the command ring failed, " - "maybe the host is dead\n"); - cancel_delayed_work(&xhci->cmd_timer); - xhci->xhc_state |= XHCI_STATE_DYING; - xhci_halt(xhci); - return -ESHUTDOWN; + /* + * Writing the CMD_RING_ABORT bit should cause a cmd completion event, + * however on some host hw the CMD_RING_RUNNING bit is correctly cleared + * but the completion event in never sent. Wait 2 secs (arbitrary + * number) to handle those cases after negation of CMD_RING_RUNNING. + */
I'm speculating a bit here, but as we now can sleep, and if we could get rid of the abort retry couldn't we swap the order of the xhci_handshake(CMD_RING_RUNNING) busywait and wait_for_completion_timeout(). We could also use a standard command timeout time while waiting for the completion something like: hci_write_64(CMD_RING_ABORT) timed_out = wait_for_completion_timeout(XHCI_CMD_DEFAULT_TIMEOUT) xhci_handshake(CMD_RING_RUNNING) if (handshake fail) { xhci_halt(), etc.. return -ESHUTDOWN } if (timed out) { xhci_cleanup_command_queue(xhci); return } It would reduce the time we spend in the xhci_handshake() busyloop
+ if (!wait_for_completion_timeout(&xhci->stop_completion, 2 * HZ)) { + xhci_dbg(xhci, "No stop event for abort, ring start fail?\n"); + xhci_cleanup_command_queue(xhci); + } else { + unsigned long flags; + spin_lock_irqsave(&xhci->lock, flags); + xhci_handle_stopped_cmd_ring(xhci, xhci_next_queued_cmd(xhci)); + spin_unlock_irqrestore(&xhci->lock, flags); } return 0; @@ -1212,79 +1273,26 @@ void xhci_cleanup_command_queue(struct x xhci_complete_del_and_free_cmd(cur_cmd, COMP_CMD_ABORT); } -/* - * Turn all commands on command ring with status set to "aborted" to no-op trbs. - * If there are other commands waiting then restart the ring and kick the timer. - * This must be called with command ring stopped and xhci->lock held. - */ -static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, - struct xhci_command *cur_cmd) -{ - struct xhci_command *i_cmd, *tmp_cmd; - u32 cycle_state; - - /* Turn all aborted commands in list to no-ops, then restart */ - list_for_each_entry_safe(i_cmd, tmp_cmd, &xhci->cmd_list, - cmd_list) { - - if (i_cmd->status != COMP_CMD_ABORT) - continue; - - i_cmd->status = COMP_CMD_STOP; - - xhci_dbg(xhci, "Turn aborted command %p to no-op\n", - i_cmd->command_trb); - /* get cycle state from the original cmd trb */ - cycle_state = le32_to_cpu( - i_cmd->command_trb->generic.field[3]) & TRB_CYCLE; - /* modify the command trb to no-op command */ - i_cmd->command_trb->generic.field[0] = 0; - i_cmd->command_trb->generic.field[1] = 0; - i_cmd->command_trb->generic.field[2] = 0; - i_cmd->command_trb->generic.field[3] = cpu_to_le32( - TRB_TYPE(TRB_CMD_NOOP) | cycle_state); - - /* - * caller waiting for completion is called when command - * completion event is received for these no-op commands - */ - } - - xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; - - /* ring command ring doorbell to restart the command ring */ - if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) && - !(xhci->xhc_state & XHCI_STATE_DYING)) { - xhci->current_cmd = cur_cmd; - xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT); - xhci_ring_cmd_db(xhci); - } - return; -} - - void xhci_handle_command_timeout(struct work_struct *work) { struct xhci_hcd *xhci; int ret; unsigned long flags; u64 hw_ring_state; - bool second_timeout = false; xhci = container_of(to_delayed_work(work), struct xhci_hcd, cmd_timer); /* mark this command to be cancelled */ spin_lock_irqsave(&xhci->lock, flags); - if (xhci->current_cmd) { - if (xhci->current_cmd->status == COMP_CMD_ABORT) - second_timeout = true; + if (xhci->current_cmd) xhci->current_cmd->status = COMP_CMD_ABORT; - } /* Make sure command ring is running before aborting it */ hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) && (hw_ring_state & CMD_RING_RUNNING)) { + /* Prevent new doorbell, and start command abort */ + xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
nice fix setting the cmd_ring_state before releasing the lock.
spin_unlock_irqrestore(&xhci->lock, flags); xhci_dbg(xhci, "Command timeout\n"); ret = xhci_abort_cmd_ring(xhci); @@ -1297,10 +1305,10 @@ void xhci_handle_command_timeout(struct return; } - /* command ring failed to restart, or host removed. Bail out */ - if (second_timeout || xhci->xhc_state & XHCI_STATE_REMOVING) { + /* host removed. Bail out */ + if (xhci->xhc_state & XHCI_STATE_REMOVING) { spin_unlock_irqrestore(&xhci->lock, flags); - xhci_dbg(xhci, "command timed out twice, ring start fail?\n"); + xhci_dbg(xhci, "host removed, ring start fail?\n"); xhci_cleanup_command_queue(xhci); return; } @@ -1347,7 +1355,7 @@ static void handle_cmd_completion(struct /* If CMD ring stopped we own the trbs between enqueue and dequeue */ if (cmd_comp_code == COMP_CMD_STOP) { - xhci_handle_stopped_cmd_ring(xhci, cmd); + complete_all(&xhci->stop_completion); return; } diff -puN drivers/usb/host/xhci.h~xhci-fix-abort-race2 drivers/usb/host/xhci.h --- xhci/drivers/usb/host/xhci.h~xhci-fix-abort-race2 2016-11-16 18:38:52.552146764 +0900 +++ xhci-hirofumi/drivers/usb/host/xhci.h 2016-11-16 18:38:52.554146762 +0900 @@ -1574,6 +1574,7 @@ struct xhci_hcd { struct list_head cmd_list; unsigned int cmd_ring_reserved_trbs; struct delayed_work cmd_timer; + struct completion stop_completion;
Tiny thing, but maybe change stop_completion to cmd_ring_stop_completion. to avoid mixing it with stopping host controller, stop endpoint, stop device etc -Mathias -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html