This is a note to let you know that I've just added the patch titled RDMA/bnxt_re: Avoid the command wait if firmware is inactive to the 6.4-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: rdma-bnxt_re-avoid-the-command-wait-if-firmware-is-i.patch and it can be found in the queue-6.4 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. commit 15a3a71e5ca04ee2b1ab2e62f847209400a429e2 Author: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx> Date: Fri Jun 9 04:01:43 2023 -0700 RDMA/bnxt_re: Avoid the command wait if firmware is inactive [ Upstream commit 3022cc15119733cebaef05feddb5d87b9e401c0e ] Add a check to avoid waiting if driver already detects a FW timeout. Return success for resource destroy in case the device is detached. Add helper function to map timeout error code to success. Signed-off-by: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx> Signed-off-by: Selvin Xavier <selvin.xavier@xxxxxxxxxxxx> Link: https://lore.kernel.org/r/1686308514-11996-7-git-send-email-selvin.xavier@xxxxxxxxxxxx Signed-off-by: Leon Romanovsky <leon@xxxxxxxxxx> Stable-dep-of: 29900bf351e1 ("RDMA/bnxt_re: Fix hang during driver unload") Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 918e588588885..bfa0f29c7abf4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -53,10 +53,47 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); +/** + * bnxt_qplib_map_rc - map return type based on opcode + * @opcode - roce slow path opcode + * + * In some cases like firmware halt is detected, the driver is supposed to + * remap the error code of the timed out command. + * + * It is not safe to assume hardware is really inactive so certain opcodes + * like destroy qp etc are not safe to be returned success, but this function + * will be called when FW already reports a timeout. This would be possible + * only when FW crashes and resets. This will clear all the HW resources. + * + * Returns: + * 0 to communicate success to caller. + * Non zero error code to communicate failure to caller. + */ +static int bnxt_qplib_map_rc(u8 opcode) +{ + switch (opcode) { + case CMDQ_BASE_OPCODE_DESTROY_QP: + case CMDQ_BASE_OPCODE_DESTROY_SRQ: + case CMDQ_BASE_OPCODE_DESTROY_CQ: + case CMDQ_BASE_OPCODE_DEALLOCATE_KEY: + case CMDQ_BASE_OPCODE_DEREGISTER_MR: + case CMDQ_BASE_OPCODE_DELETE_GID: + case CMDQ_BASE_OPCODE_DESTROY_QP1: + case CMDQ_BASE_OPCODE_DESTROY_AH: + case CMDQ_BASE_OPCODE_DEINITIALIZE_FW: + case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC: + case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE: + return 0; + default: + return -ETIMEDOUT; + } +} + /** * __wait_for_resp - Don't hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode * * Wait for command completion in sleepable context. * @@ -64,7 +101,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); * 0 if command is completed by firmware. * Non zero error code for rest of the case. */ -static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq; u16 cbit; @@ -74,6 +111,9 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) cbit = cookie % rcfw->cmdq_depth; do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(opcode); + /* Non zero means command completed */ ret = wait_event_timeout(cmdq->waitq, !test_bit(cbit, cmdq->cmdq_bitmap), @@ -94,6 +134,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * __block_for_resp - hold the cpu context and wait for response * @rcfw - rcfw channel instance of rdev * @cookie - cookie to track the command + * @opcode - rcfw submitted for given opcode * * This function will hold the cpu (non-sleepable context) and * wait for command completion. Maximum holding interval is 8 second. @@ -102,7 +143,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * -ETIMEOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. */ -static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; unsigned long issue_time = 0; @@ -112,6 +153,9 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) issue_time = jiffies; do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(opcode); + udelay(1); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); @@ -267,9 +311,9 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, } while (retry_cnt--); if (msg->block) - rc = __block_for_resp(rcfw, cookie); + rc = __block_for_resp(rcfw, cookie, opcode); else - rc = __wait_for_resp(rcfw, cookie); + rc = __wait_for_resp(rcfw, cookie, opcode); if (rc) { /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",