ping? OGAWA Hirofumi <hirofumi@xxxxxxxxxxxxxxxxxx> writes: > Current abort operation has race. > > xhci_handle_command_timeout() > xhci_abort_cmd_ring() > xhci_write_64(CMD_RING_ABORT) > xhci_handshake(5s) > do { > check CMD_RING_RUNNING > udelay(1) > ... > COMP_CMD_ABORT event > COMP_CMD_STOP event > xhci_handle_stopped_cmd_ring() > restart cmd_ring > CMD_RING_RUNNING become 1 again > } while () > return -ETIMEDOUT > xhci_write_64(CMD_RING_ABORT) > /* can abort random command */ > > To do abort operation correctly, we have to wait both of COMP_CMD_STOP > event and negation of CMD_RING_RUNNING. > > But like above, while timeout handler is waiting negation of > CMD_RING_RUNNING, event handler can restart cmd_ring. So timeout > handler never be notice negation of CMD_RING_RUNNING, and retry of > CMD_RING_ABORT can abort random command (BTW, I guess retry of > CMD_RING_ABORT was workaround of this race). > > To fix this race, this moves xhci_handle_stopped_cmd_ring() to > xhci_abort_cmd_ring(). And timeout handler waits COMP_CMD_STOP event. > > At this point, timeout handler is owner of cmd_ring, and safely > restart cmd_ring by using xhci_handle_stopped_cmd_ring(). > > [FWIW, as bonus, this way would be easily extend to add CMD_RING_PAUSE > operation] > > Signed-off-by: OGAWA Hirofumi <hirofumi@xxxxxxxxxxxxxxxxxx> > --- > > drivers/usb/host/xhci-mem.c | 1 > drivers/usb/host/xhci-ring.c | 163 ++++++++++++++++++++++-------------------- > drivers/usb/host/xhci.h | 1 > 3 files changed, 88 insertions(+), 77 deletions(-) > > diff -puN drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2 drivers/usb/host/xhci-mem.c > --- xhci/drivers/usb/host/xhci-mem.c~xhci-fix-abort-race2 2016-11-16 18:38:52.551146764 +0900 > +++ xhci-hirofumi/drivers/usb/host/xhci-mem.c 2016-11-17 01:04:11.402014445 +0900 > @@ -2344,6 +2344,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, > > /* init command timeout work */ > INIT_DELAYED_WORK(&xhci->cmd_timer, xhci_handle_command_timeout); > + init_completion(&xhci->cmd_ring_stop_completion); > > page_size = readl(&xhci->op_regs->page_size); > xhci_dbg_trace(xhci, trace_xhci_dbg_init, > diff -puN drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2 drivers/usb/host/xhci-ring.c > --- xhci/drivers/usb/host/xhci-ring.c~xhci-fix-abort-race2 2016-11-16 18:38:52.552146764 +0900 > +++ xhci-hirofumi/drivers/usb/host/xhci-ring.c 2016-11-17 01:04:06.393018485 +0900 > @@ -284,6 +284,60 @@ static bool xhci_mod_cmd_timer(struct xh > return mod_delayed_work(system_wq, &xhci->cmd_timer, delay); > } > > +static struct xhci_command *xhci_next_queued_cmd(struct xhci_hcd *xhci) > +{ > + return list_first_entry_or_null(&xhci->cmd_list, struct xhci_command, > + cmd_list); > +} > + > +/* > + * Turn all commands on command ring with status set to "aborted" to no-op trbs. > + * If there are other commands waiting then restart the ring and kick the timer. > + * This must be called with command ring stopped and xhci->lock held. > + */ > +static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, > + struct xhci_command *cur_cmd) > +{ > + struct xhci_command *i_cmd; > + u32 cycle_state; > + > + /* Turn all aborted commands in list to no-ops, then restart */ > + list_for_each_entry(i_cmd, &xhci->cmd_list, cmd_list) { > + > + if (i_cmd->status != COMP_CMD_ABORT) > + continue; > + > + i_cmd->status = COMP_CMD_STOP; > + > + xhci_dbg(xhci, "Turn aborted command %p to no-op\n", > + i_cmd->command_trb); > + /* get cycle state from the original cmd trb */ > + cycle_state = le32_to_cpu( > + i_cmd->command_trb->generic.field[3]) & TRB_CYCLE; > + /* modify the command trb to no-op command */ > + i_cmd->command_trb->generic.field[0] = 0; > + i_cmd->command_trb->generic.field[1] = 0; > + i_cmd->command_trb->generic.field[2] = 0; > + i_cmd->command_trb->generic.field[3] = cpu_to_le32( > + TRB_TYPE(TRB_CMD_NOOP) | cycle_state); > + > + /* > + * caller waiting for completion is called when command > + * completion event is received for these no-op commands > + */ > + } > + > + xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; > + > + /* ring command ring doorbell to restart the command ring */ > + if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) && > + !(xhci->xhc_state & XHCI_STATE_DYING)) { > + xhci->current_cmd = cur_cmd; > + xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT); > + xhci_ring_cmd_db(xhci); > + } > +} > + > static int xhci_abort_cmd_ring(struct xhci_hcd *xhci) > { > u64 temp_64; > @@ -291,16 +345,9 @@ static int xhci_abort_cmd_ring(struct xh > > xhci_dbg(xhci, "Abort command ring\n"); > > - temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); > - xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; > + reinit_completion(&xhci->cmd_ring_stop_completion); > > - /* > - * Writing the CMD_RING_ABORT bit should cause a cmd completion event, > - * however on some host hw the CMD_RING_RUNNING bit is correctly cleared > - * but the completion event in never sent. Use the cmd timeout timer to > - * handle those cases. Use twice the time to cover the bit polling retry > - */ > - xhci_mod_cmd_timer(xhci, 2 * XHCI_CMD_DEFAULT_TIMEOUT); > + temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); > xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, > &xhci->op_regs->cmd_ring); > > @@ -320,15 +367,30 @@ static int xhci_abort_cmd_ring(struct xh > udelay(1000); > ret = xhci_handshake(&xhci->op_regs->cmd_ring, > CMD_RING_RUNNING, 0, 3 * 1000 * 1000); > - if (ret == 0) > - return 0; > + if (ret < 0) { > + xhci_err(xhci, "Stopped the command ring failed, " > + "maybe the host is dead\n"); > + xhci->xhc_state |= XHCI_STATE_DYING; > + xhci_halt(xhci); > + return -ESHUTDOWN; > + } > + } > > - xhci_err(xhci, "Stopped the command ring failed, " > - "maybe the host is dead\n"); > - cancel_delayed_work(&xhci->cmd_timer); > - xhci->xhc_state |= XHCI_STATE_DYING; > - xhci_halt(xhci); > - return -ESHUTDOWN; > + /* > + * Writing the CMD_RING_ABORT bit should cause a cmd completion event, > + * however on some host hw the CMD_RING_RUNNING bit is correctly cleared > + * but the completion event in never sent. Wait 2 secs (arbitrary > + * number) to handle those cases after negation of CMD_RING_RUNNING. > + */ > + if (!wait_for_completion_timeout(&xhci->cmd_ring_stop_completion, > + 2 * HZ)) { > + xhci_dbg(xhci, "No stop event for abort, ring start fail?\n"); > + xhci_cleanup_command_queue(xhci); > + } else { > + unsigned long flags; > + spin_lock_irqsave(&xhci->lock, flags); > + xhci_handle_stopped_cmd_ring(xhci, xhci_next_queued_cmd(xhci)); > + spin_unlock_irqrestore(&xhci->lock, flags); > } > > return 0; > @@ -1212,79 +1274,26 @@ void xhci_cleanup_command_queue(struct x > xhci_complete_del_and_free_cmd(cur_cmd, COMP_CMD_ABORT); > } > > -/* > - * Turn all commands on command ring with status set to "aborted" to no-op trbs. > - * If there are other commands waiting then restart the ring and kick the timer. > - * This must be called with command ring stopped and xhci->lock held. > - */ > -static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci, > - struct xhci_command *cur_cmd) > -{ > - struct xhci_command *i_cmd, *tmp_cmd; > - u32 cycle_state; > - > - /* Turn all aborted commands in list to no-ops, then restart */ > - list_for_each_entry_safe(i_cmd, tmp_cmd, &xhci->cmd_list, > - cmd_list) { > - > - if (i_cmd->status != COMP_CMD_ABORT) > - continue; > - > - i_cmd->status = COMP_CMD_STOP; > - > - xhci_dbg(xhci, "Turn aborted command %p to no-op\n", > - i_cmd->command_trb); > - /* get cycle state from the original cmd trb */ > - cycle_state = le32_to_cpu( > - i_cmd->command_trb->generic.field[3]) & TRB_CYCLE; > - /* modify the command trb to no-op command */ > - i_cmd->command_trb->generic.field[0] = 0; > - i_cmd->command_trb->generic.field[1] = 0; > - i_cmd->command_trb->generic.field[2] = 0; > - i_cmd->command_trb->generic.field[3] = cpu_to_le32( > - TRB_TYPE(TRB_CMD_NOOP) | cycle_state); > - > - /* > - * caller waiting for completion is called when command > - * completion event is received for these no-op commands > - */ > - } > - > - xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; > - > - /* ring command ring doorbell to restart the command ring */ > - if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) && > - !(xhci->xhc_state & XHCI_STATE_DYING)) { > - xhci->current_cmd = cur_cmd; > - xhci_mod_cmd_timer(xhci, XHCI_CMD_DEFAULT_TIMEOUT); > - xhci_ring_cmd_db(xhci); > - } > - return; > -} > - > - > void xhci_handle_command_timeout(struct work_struct *work) > { > struct xhci_hcd *xhci; > int ret; > unsigned long flags; > u64 hw_ring_state; > - bool second_timeout = false; > > xhci = container_of(to_delayed_work(work), struct xhci_hcd, cmd_timer); > > /* mark this command to be cancelled */ > spin_lock_irqsave(&xhci->lock, flags); > - if (xhci->current_cmd) { > - if (xhci->current_cmd->status == COMP_CMD_ABORT) > - second_timeout = true; > + if (xhci->current_cmd) > xhci->current_cmd->status = COMP_CMD_ABORT; > - } > > /* Make sure command ring is running before aborting it */ > hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); > if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) && > (hw_ring_state & CMD_RING_RUNNING)) { > + /* Prevent new doorbell, and start command abort */ > + xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; > spin_unlock_irqrestore(&xhci->lock, flags); > xhci_dbg(xhci, "Command timeout\n"); > ret = xhci_abort_cmd_ring(xhci); > @@ -1297,10 +1306,10 @@ void xhci_handle_command_timeout(struct > return; > } > > - /* command ring failed to restart, or host removed. Bail out */ > - if (second_timeout || xhci->xhc_state & XHCI_STATE_REMOVING) { > + /* host removed. Bail out */ > + if (xhci->xhc_state & XHCI_STATE_REMOVING) { > spin_unlock_irqrestore(&xhci->lock, flags); > - xhci_dbg(xhci, "command timed out twice, ring start fail?\n"); > + xhci_dbg(xhci, "host removed, ring start fail?\n"); > xhci_cleanup_command_queue(xhci); > return; > } > @@ -1347,7 +1356,7 @@ static void handle_cmd_completion(struct > > /* If CMD ring stopped we own the trbs between enqueue and dequeue */ > if (cmd_comp_code == COMP_CMD_STOP) { > - xhci_handle_stopped_cmd_ring(xhci, cmd); > + complete_all(&xhci->cmd_ring_stop_completion); > return; > } > > diff -puN drivers/usb/host/xhci.h~xhci-fix-abort-race2 drivers/usb/host/xhci.h > --- xhci/drivers/usb/host/xhci.h~xhci-fix-abort-race2 2016-11-16 18:38:52.552146764 +0900 > +++ xhci-hirofumi/drivers/usb/host/xhci.h 2016-11-17 01:04:06.393018485 +0900 > @@ -1574,6 +1574,7 @@ struct xhci_hcd { > struct list_head cmd_list; > unsigned int cmd_ring_reserved_trbs; > struct delayed_work cmd_timer; > + struct completion cmd_ring_stop_completion; > struct xhci_command *current_cmd; > struct xhci_ring *event_ring; > struct xhci_erst erst; > _ -- OGAWA Hirofumi <hirofumi@xxxxxxxxxxxxxxxxxx> -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html