From: Shyam Sundar <ssundar@xxxxxxxxxxx> Here is the relevant logs for the problem we are solving: qedf_flush_active_ios:1707]:3: Flush active i/o's num=0x17 fcport=0xffff948168fbcc80 port_id=0x550200 scsi_id=0. qedf_flush_active_ios:1708]:3: Locking flush mutex. qedf_flush_active_ios:1758]:3: Not outstanding, xid=0xaaf, cmd_type=3 refcount=1. qedf_flush_active_ios:1896]:3: Flushed 0x16 I/Os, active=0x1. qedf_flush_active_ios:1901]:3: Flushed 0x16 I/Os, active=0x1 cnt=60. qedf_send_rrq:295]:3: Sending RRQ orig io = ffffb48b8f7d7158, orig_xid = 0xaaf qedf_initiate_els:37]:3: Sending ELS qedf_initiate_els:68]:3: initiate_els els_req = 0xffffb48b8f6d3098 cb_arg = ffff948fd5e4de80 xid = 4c6 qedf_init_mp_req:2172]:3: Entered. qedf_init_mp_task:727]:3: Initializing MP task for cmd_type=4 qedf_initiate_els:134]:3: Ringing doorbell for ELS req qedf_flush_active_ios:1901]:3: Flushed 0x16 I/Os, active=0x2 cnt=20. qedf_cmd_timeout:96]:3: ELS timeout, xid=0x4c6. qedf_rrq_compl:186]:3: Entered. qedf_rrq_compl:204]:3: rrq_compl: orig io = ffffb48b8f7d7158, orig xid = 0xaaf, rrq_xid = 0x4c6, refcount=1 qedf_flush_active_ios:1935]:3: Unlocking flush mutex. qedf_upload_connection:1579]:3: Uploading connection port_id=550200. We found an ABTS command for which CMD_OUTSTANDING was cleared (line 3). For this command, delayed send_rrq was queued, but would take 10 secs to execute. Adding capability to detect that (based on io_req->state that is being introduced), and attempt to cancel rrq_work. If we succeed, we drop the reference and free the io_req. If we cannot, then the els will get sent out and we will wait for 10 secs for it to complete. Signed-off-by: Shyam Sundar <ssundar@xxxxxxxxxxx> Signed-off-by: Saurav Kashyap <skashyap@xxxxxxxxxxx> --- drivers/scsi/qedf/qedf.h | 17 +++++++++++++++ drivers/scsi/qedf/qedf_io.c | 53 +++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h index 1e0d142..f647817 100644 --- a/drivers/scsi/qedf/qedf.h +++ b/drivers/scsi/qedf/qedf.h @@ -119,9 +119,26 @@ struct qedf_ioreq { u8 io_req_flags; uint8_t tm_flags; struct qedf_rport *fcport; +#define QEDF_CMD_ST_INACTIVE 0 +#define QEDFC_CMD_ST_IO_ACTIVE 1 +#define QEDFC_CMD_ST_ABORT_ACTIVE 2 +#define QEDFC_CMD_ST_ABORT_ACTIVE_EH 3 +#define QEDFC_CMD_ST_CLEANUP_ACTIVE 4 +#define QEDFC_CMD_ST_CLEANUP_ACTIVE_EH 5 +#define QEDFC_CMD_ST_RRQ_ACTIVE 6 +#define QEDFC_CMD_ST_RRQ_WAIT 7 +#define QEDFC_CMD_ST_OXID_RETIRE_WAIT 8 +#define QEDFC_CMD_ST_TMF_ACTIVE 9 +#define QEDFC_CMD_ST_DRAIN_ACTIVE 10 +#define QEDFC_CMD_ST_CLEANED 11 +#define QEDFC_CMD_ST_ELS_ACTIVE 12 + atomic_t state; unsigned long flags; enum qedf_ioreq_event event; size_t data_xfer_len; + /* ID: 001: Alloc cmd (qedf_alloc_cmd) */ + /* ID: 002: Initiate ABTS (qedf_initiate_abts) */ + /* ID: 003: For RRQ (qedf_process_abts_compl) */ struct kref refcount; struct qedf_cmd_mgr *cmd_mgr; struct io_bdt *bd_tbl; diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index ca7ca8c..db5d49e 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -170,6 +170,7 @@ static void qedf_handle_rrq(struct work_struct *work) struct qedf_ioreq *io_req = container_of(work, struct qedf_ioreq, rrq_work.work); + atomic_set(&io_req->state, QEDFC_CMD_ST_RRQ_ACTIVE); qedf_send_rrq(io_req); } @@ -374,7 +375,8 @@ struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type) io_req->lun = -1; /* Hold the io_req against deletion */ - kref_init(&io_req->refcount); + kref_init(&io_req->refcount); /* ID: 001 */ + atomic_set(&io_req->state, QEDFC_CMD_ST_IO_ACTIVE); /* Bind io_bdt for this io_req */ /* Have a static link between io_req and io_bdt_pool */ @@ -447,6 +449,7 @@ void qedf_release_cmd(struct kref *ref) atomic_inc(&cmd_mgr->free_list_cnt); atomic_dec(&fcport->num_active_ios); + atomic_set(&io_req->state, QEDF_CMD_ST_INACTIVE); if (atomic_read(&fcport->num_active_ios) < 0) QEDF_WARN(&(fcport->qedf->dbg_ctx), "active_ios < 0.\n"); @@ -1627,6 +1630,21 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun) QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO, "Not outstanding, xid=0x%x, cmd_type=%d refcount=%d.\n", io_req->xid, io_req->cmd_type, refcount); + /* If RRQ work has been queue, try to cancel it and + * free the io_req + */ + if (atomic_read(&io_req->state) == + QEDFC_CMD_ST_RRQ_WAIT) { + if (cancel_delayed_work_sync + (&io_req->rrq_work)) { + QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO, + "Putting reference for pending RRQ work xid=0x%x.\n", + io_req->xid); + /* ID: 003 */ + kref_put(&io_req->refcount, + qedf_release_cmd); + } + } continue; } @@ -1650,6 +1668,7 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun) } if (io_req->cmd_type == QEDF_ABTS) { + /* ID: 004 */ rc = kref_get_unless_zero(&io_req->refcount); if (!rc) { QEDF_ERR(&(qedf->dbg_ctx), @@ -1665,24 +1684,25 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun) if (cancel_delayed_work_sync(&io_req->rrq_work)) { QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO, - "Putting reference for pending RRQ work xid=0x%x.\n", + "Putting ref for cancelled RRQ work xid=0x%x.\n", io_req->xid); kref_put(&io_req->refcount, qedf_release_cmd); } - /* Cancel any timeout work */ - cancel_delayed_work_sync(&io_req->timeout_work); - - if (!test_bit(QEDF_CMD_IN_ABORT, &io_req->flags)) - goto free_cmd; - - qedf_initiate_cleanup(io_req, true); + if (cancel_delayed_work_sync(&io_req->timeout_work)) { + QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO, + "Putting ref for cancelled tmo work xid=0x%x.\n", + io_req->xid); + qedf_initiate_cleanup(io_req, true); + /* Notify eh_abort handler that ABTS is + * complete + */ + complete(&io_req->abts_done); + clear_bit(QEDF_CMD_IN_ABORT, &io_req->flags); + /* ID: 002 */ + kref_put(&io_req->refcount, qedf_release_cmd); + } flush_cnt++; - - /* Notify eh_abort handler that ABTS is complete */ - kref_put(&io_req->refcount, qedf_release_cmd); - complete(&io_req->abts_done); - goto free_cmd; } @@ -1722,7 +1742,7 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun) qedf_initiate_cleanup(io_req, true); free_cmd: - kref_put(&io_req->refcount, qedf_release_cmd); + kref_put(&io_req->refcount, qedf_release_cmd); /* ID: 004 */ } wait_cnt = 60; @@ -1929,7 +1949,7 @@ void qedf_process_abts_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, "ABTS response - ACC Send RRQ after R_A_TOV\n"); io_req->event = QEDF_IOREQ_EV_ABORT_SUCCESS; - rc = kref_get_unless_zero(&io_req->refcount); + rc = kref_get_unless_zero(&io_req->refcount); /* ID: 003 */ if (!rc) { QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_SCSI_TM, "kref is already zero so ABTS was already completed or flushed xid=0x%x.\n", @@ -1942,6 +1962,7 @@ void qedf_process_abts_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, */ queue_delayed_work(qedf->dpc_wq, &io_req->rrq_work, msecs_to_jiffies(qedf->lport->r_a_tov)); + atomic_set(&io_req->state, QEDFC_CMD_ST_RRQ_WAIT); break; /* For error cases let the cleanup return the command */ case FC_RCTL_BA_RJT: -- 1.8.3.1